mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-03-20 19:03:51 +08:00
refactor: remove smart_search bridge, add codexlens MCP template, delete codex-lens v1
- Delete smart-search.ts (3476 lines) and codex-lens.ts stub — the CCW
bridge that wrapped the codexlens-search CLI is gone entirely
- Remove executeToolWithProgress and all smart_search registrations from
tools/index.ts and mcp-server/index.ts
- Replace checkSemanticStatus() calls in core-memory-routes with inline
{ available: false } — v1 bridge no longer provides this
- Inline no-op stubs in smart-context.ts to replace codex-lens imports
- Seed built-in 'codexlens' MCP template at server startup via
seedBuiltinTemplates() in mcp-routes.ts; uses uvx --from
codexlens-search[mcp] codexlens-mcp so users install via uv
- Remove smart_search from all default enabled-tools strings (backend
mcp-routes, mcp-server DEFAULT_TOOLS, frontend api.ts, mcp install
helpers) and CCW_MCP_TOOLS UI list
- Delete frontend pages/hooks/components: CodexLensManagerPage,
useV2SearchManager, useIndex, IndexManager; remove routes, sidebar
entry, and all re-exports
- Remove index status display section from WorkflowTaskWidget
- Delete four smart-search test files; update mcp-server.test.js and
e2e/mcp-tools.e2e.test.ts to remove smart_search assertions
- Delete codex-lens/ source directory (v1 Python monolith, ~75 files)
— no longer imported or subprocess-called by CCW
Net: ~11 000 lines removed, +30 lines for template seeding
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Binary file not shown.
@@ -1,71 +0,0 @@
|
||||
# CodexLens Environment Configuration
|
||||
#
|
||||
# Configuration locations (copy to one of these):
|
||||
# - ~/.codexlens/.env (global, applies to all projects)
|
||||
# - project/.codexlens/.env (workspace-local)
|
||||
# - project/.env (project root)
|
||||
#
|
||||
# Priority order (later overrides earlier):
|
||||
# 1. Environment variables (already set in shell) - highest
|
||||
# 2. .codexlens/.env (workspace-local)
|
||||
# 3. .env (project root)
|
||||
# 4. ~/.codexlens/.env (global) - lowest
|
||||
|
||||
# ============================================
|
||||
# RERANKER Configuration
|
||||
# ============================================
|
||||
|
||||
# API key for reranker service (SiliconFlow/Cohere/Jina)
|
||||
# Required for 'api' backend
|
||||
# RERANKER_API_KEY=sk-xxxx
|
||||
|
||||
# Base URL for reranker API (overrides provider default)
|
||||
# SiliconFlow: https://api.siliconflow.cn
|
||||
# Cohere: https://api.cohere.ai
|
||||
# Jina: https://api.jina.ai
|
||||
# RERANKER_API_BASE=https://api.siliconflow.cn
|
||||
|
||||
# Reranker provider: siliconflow, cohere, jina
|
||||
# RERANKER_PROVIDER=siliconflow
|
||||
|
||||
# Reranker model name
|
||||
# SiliconFlow: BAAI/bge-reranker-v2-m3
|
||||
# Cohere: rerank-english-v3.0
|
||||
# Jina: jina-reranker-v2-base-multilingual
|
||||
# RERANKER_MODEL=BAAI/bge-reranker-v2-m3
|
||||
|
||||
# ============================================
|
||||
# EMBEDDING Configuration
|
||||
# ============================================
|
||||
|
||||
# API key for embedding service (for litellm backend)
|
||||
# EMBEDDING_API_KEY=sk-xxxx
|
||||
|
||||
# Base URL for embedding API
|
||||
# EMBEDDING_API_BASE=https://api.openai.com
|
||||
|
||||
# Embedding model name
|
||||
# EMBEDDING_MODEL=text-embedding-3-small
|
||||
|
||||
# ============================================
|
||||
# LITELLM Configuration
|
||||
# ============================================
|
||||
|
||||
# API key for LiteLLM (for litellm reranker backend)
|
||||
# LITELLM_API_KEY=sk-xxxx
|
||||
|
||||
# Base URL for LiteLLM
|
||||
# LITELLM_API_BASE=
|
||||
|
||||
# LiteLLM model name
|
||||
# LITELLM_MODEL=gpt-4o-mini
|
||||
|
||||
# ============================================
|
||||
# General Configuration
|
||||
# ============================================
|
||||
|
||||
# Custom data directory path (default: ~/.codexlens)
|
||||
# CODEXLENS_DATA_DIR=~/.codexlens
|
||||
|
||||
# Enable debug mode (true/false)
|
||||
# CODEXLENS_DEBUG=false
|
||||
70
codex-lens/.github/workflows/security.yml
vendored
70
codex-lens/.github/workflows/security.yml
vendored
@@ -1,70 +0,0 @@
|
||||
# Security scanning workflow for codex-lens
|
||||
# Runs pip-audit to check for known vulnerabilities in dependencies
|
||||
|
||||
name: Security Scan
|
||||
|
||||
on:
|
||||
# Run on push to main branch
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
- master
|
||||
# Run weekly on Sundays at 00:00 UTC
|
||||
schedule:
|
||||
- cron: '0 0 * * 0'
|
||||
# Allow manual trigger
|
||||
workflow_dispatch:
|
||||
|
||||
jobs:
|
||||
security-audit:
|
||||
name: Dependency Vulnerability Scan
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.10'
|
||||
cache: 'pip'
|
||||
|
||||
- name: Install pip-audit
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install pip-audit
|
||||
|
||||
- name: Run pip-audit on requirements.in
|
||||
run: pip-audit --requirement requirements.in
|
||||
continue-on-error: false
|
||||
|
||||
- name: Run pip-audit on pyproject.toml dependencies
|
||||
run: pip-audit --project-path .
|
||||
continue-on-error: false
|
||||
|
||||
- name: Check for safety issues
|
||||
run: |
|
||||
pip install safety
|
||||
safety check --json || true
|
||||
continue-on-error: true
|
||||
|
||||
bandit-security:
|
||||
name: Code Security Linting
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.10'
|
||||
|
||||
- name: Install bandit
|
||||
run: pip install bandit[toml]
|
||||
|
||||
- name: Run bandit security linter
|
||||
run: bandit -r src/ -ll -i
|
||||
continue-on-error: true
|
||||
@@ -1 +0,0 @@
|
||||
{"ignore_patterns": ["frontend/dist"], "extension_filters": ["*.min.js"]}
|
||||
@@ -1 +0,0 @@
|
||||
export const app = 1
|
||||
@@ -1 +0,0 @@
|
||||
export const bundle = 1
|
||||
@@ -1 +0,0 @@
|
||||
export const compiled = 1
|
||||
@@ -1 +0,0 @@
|
||||
export const bundle = 1
|
||||
@@ -1 +0,0 @@
|
||||
export const app = 1
|
||||
@@ -1 +0,0 @@
|
||||
print('artifact')
|
||||
@@ -1 +0,0 @@
|
||||
print('artifact')
|
||||
@@ -1 +0,0 @@
|
||||
print('artifact')
|
||||
@@ -1 +0,0 @@
|
||||
print('artifact')
|
||||
@@ -1 +0,0 @@
|
||||
print('artifact')
|
||||
@@ -1 +0,0 @@
|
||||
print('artifact')
|
||||
@@ -1 +0,0 @@
|
||||
print('artifact')
|
||||
@@ -1 +0,0 @@
|
||||
print('ok')
|
||||
@@ -1 +0,0 @@
|
||||
print('artifact')
|
||||
@@ -1 +0,0 @@
|
||||
export const app = 1
|
||||
@@ -1 +0,0 @@
|
||||
export const bundle = 1
|
||||
@@ -1 +0,0 @@
|
||||
export const skip = 1
|
||||
@@ -1 +0,0 @@
|
||||
{"ignore_patterns": ["frontend/dist", "coverage"], "extension_filters": ["*.min.js", "*.map"]}
|
||||
@@ -1 +0,0 @@
|
||||
print('compiled')
|
||||
@@ -1,240 +0,0 @@
|
||||
# Association Tree Implementation Summary
|
||||
|
||||
## Overview
|
||||
|
||||
Successfully implemented LSP-based association tree search for CodexLens. The implementation consists of two core components that work together to discover and rank code relationships using Language Server Protocol (LSP) call hierarchy capabilities.
|
||||
|
||||
## Components Implemented
|
||||
|
||||
### 1. AssociationTreeBuilder (`src/codexlens/search/association_tree/builder.py`)
|
||||
|
||||
**Purpose**: Build call relationship trees from seed locations using LSP
|
||||
|
||||
**Key Features**:
|
||||
- Depth-first recursive expansion from seed positions
|
||||
- Supports bidirectional expansion:
|
||||
- Incoming calls (callers) - who calls this function
|
||||
- Outgoing calls (callees) - what this function calls
|
||||
- Automatic cycle detection and marking
|
||||
- Configurable max depth (default: 5)
|
||||
- Async/await with parallel expansion
|
||||
- Timeout handling (5s per LSP request)
|
||||
- Graceful error handling
|
||||
|
||||
**Core Methods**:
|
||||
- `build_tree()`: Main entry point for tree construction
|
||||
- `_expand_node()`: Recursive DFS expansion
|
||||
- `_expand_incoming_calls()`: Process callers
|
||||
- `_expand_outgoing_calls()`: Process callees
|
||||
|
||||
### 2. ResultDeduplicator (`src/codexlens/search/association_tree/deduplicator.py`)
|
||||
|
||||
**Purpose**: Extract unique nodes from trees and assign relevance scores
|
||||
|
||||
**Scoring Algorithm**:
|
||||
```
|
||||
Score = 0.4 * depth_score + 0.3 * frequency_score + 0.3 * kind_score
|
||||
|
||||
where:
|
||||
- depth_score: 1.0 at depth 0, decreasing to 0.0 at depth 10
|
||||
- frequency_score: occurrences / max_occurrences
|
||||
- kind_score: function/method (1.0) > class (0.8) > variable (0.4)
|
||||
```
|
||||
|
||||
**Key Features**:
|
||||
- Deduplication by (file_path, start_line, end_line)
|
||||
- Merge duplicate nodes across different paths
|
||||
- Track minimum depth and occurrence count
|
||||
- Configurable score weights
|
||||
- Filter by kind or file pattern
|
||||
- JSON serialization support
|
||||
|
||||
### 3. Data Structures (`src/codexlens/search/association_tree/data_structures.py`)
|
||||
|
||||
**TreeNode**:
|
||||
- Represents a single node in the call tree
|
||||
- Tracks depth, parents, children, paths
|
||||
- Marks circular references
|
||||
|
||||
**CallTree**:
|
||||
- Complete tree structure with roots and edges
|
||||
- Node lookup by ID
|
||||
- Edge tracking for relationship visualization
|
||||
|
||||
**UniqueNode**:
|
||||
- Deduplicated result with metadata
|
||||
- Aggregates multiple occurrences
|
||||
- Contains relevance score
|
||||
|
||||
## Integration with StandaloneLspManager
|
||||
|
||||
Extended `StandaloneLspManager` with missing method:
|
||||
|
||||
**Added**: `get_outgoing_calls()` method (`src/codexlens/lsp/standalone_manager.py:1057-1086`)
|
||||
|
||||
This method complements the existing `get_incoming_calls()` to enable bidirectional call tree traversal.
|
||||
|
||||
## Testing
|
||||
|
||||
Comprehensive test suite with 9 tests covering:
|
||||
|
||||
1. **Simple tree building**: Basic tree construction
|
||||
2. **Cycle detection**: Circular reference handling
|
||||
3. **Max depth limits**: Depth boundary enforcement
|
||||
4. **Empty trees**: Edge case handling
|
||||
5. **Basic deduplication**: Node merging logic
|
||||
6. **Scoring algorithm**: Relevance ranking
|
||||
7. **Max results limit**: Result pagination
|
||||
8. **Kind filtering**: Symbol type filtering
|
||||
9. **Serialization**: JSON export
|
||||
|
||||
**Test Results**: All 9 tests passing ✅
|
||||
|
||||
**Test File**: `tests/test_association_tree.py`
|
||||
|
||||
## Usage Example
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from codexlens.lsp.standalone_manager import StandaloneLspManager
|
||||
from codexlens.search.association_tree import (
|
||||
AssociationTreeBuilder,
|
||||
ResultDeduplicator,
|
||||
)
|
||||
|
||||
async def search_with_association_tree(file_path: str, line: int):
|
||||
async with StandaloneLspManager(workspace_root="/path/to/project") as lsp:
|
||||
# Build tree
|
||||
builder = AssociationTreeBuilder(lsp)
|
||||
tree = await builder.build_tree(
|
||||
seed_file_path=file_path,
|
||||
seed_line=line,
|
||||
max_depth=5,
|
||||
expand_callers=True,
|
||||
expand_callees=True,
|
||||
)
|
||||
|
||||
# Deduplicate and score
|
||||
deduplicator = ResultDeduplicator()
|
||||
unique_nodes = deduplicator.deduplicate(tree, max_results=20)
|
||||
|
||||
# Return results
|
||||
return deduplicator.to_dict_list(unique_nodes)
|
||||
|
||||
# Run
|
||||
results = asyncio.run(search_with_association_tree("src/main.py", 42))
|
||||
```
|
||||
|
||||
## Integration Point
|
||||
|
||||
The components can be integrated into `HybridSearchEngine`:
|
||||
|
||||
```python
|
||||
# In hybrid_search.py
|
||||
async def _search_association_tree(self, query: str, limit: int):
|
||||
# 1. Get seed results from vector search
|
||||
seed_results = await self._search_vector(query, limit=5)
|
||||
|
||||
# 2. Build association trees
|
||||
builder = AssociationTreeBuilder(self.lsp_manager)
|
||||
tree = await builder.build_tree(
|
||||
seed_file_path=seed_results[0].file_path,
|
||||
seed_line=seed_results[0].line,
|
||||
max_depth=5,
|
||||
)
|
||||
|
||||
# 3. Deduplicate and rank
|
||||
deduplicator = ResultDeduplicator()
|
||||
unique_nodes = deduplicator.deduplicate(tree, max_results=limit)
|
||||
|
||||
# 4. Convert to search results
|
||||
return self._convert_to_search_results(unique_nodes)
|
||||
```
|
||||
|
||||
## File Structure
|
||||
|
||||
```
|
||||
src/codexlens/search/association_tree/
|
||||
├── __init__.py # Module exports
|
||||
├── builder.py # AssociationTreeBuilder
|
||||
├── data_structures.py # TreeNode, CallTree, UniqueNode
|
||||
├── deduplicator.py # ResultDeduplicator
|
||||
└── README.md # Documentation
|
||||
|
||||
tests/
|
||||
└── test_association_tree.py # Unit tests (9 tests)
|
||||
|
||||
examples/
|
||||
└── association_tree_demo.py # Demo script
|
||||
```
|
||||
|
||||
## Performance Characteristics
|
||||
|
||||
**Time Complexity**:
|
||||
- Tree building: O(nodes * avg_calls) with max_depth limit
|
||||
- Deduplication: O(n log n) for sorting
|
||||
|
||||
**Space Complexity**:
|
||||
- Tree: O(nodes + edges)
|
||||
- Unique nodes: O(unique_symbols)
|
||||
|
||||
**Typical Performance** (max_depth=5):
|
||||
- Small codebase: < 1s
|
||||
- Medium codebase: 1-3s
|
||||
- Large codebase: 3-10s
|
||||
|
||||
**Optimization Strategies**:
|
||||
1. Limit max_depth (recommended: 3-5)
|
||||
2. Use timeouts (default: 5s per node)
|
||||
3. Enable parallel expansion (default: on)
|
||||
4. Filter by symbol kind early
|
||||
|
||||
## Error Handling
|
||||
|
||||
The implementation handles:
|
||||
- ✅ LSP timeouts (logs warning, continues)
|
||||
- ✅ Missing call hierarchy support (returns empty tree)
|
||||
- ✅ Connection failures (skips node, continues)
|
||||
- ✅ Invalid LSP responses (logs error, skips)
|
||||
- ✅ Circular references (marks cycle, stops recursion)
|
||||
- ✅ Max depth exceeded (stops expansion)
|
||||
|
||||
## Code Quality
|
||||
|
||||
**Code Style**:
|
||||
- Python 3.10+ features (type hints, dataclasses)
|
||||
- Follows existing CodexLens conventions
|
||||
- Comprehensive docstrings
|
||||
- Async/await throughout
|
||||
|
||||
**Testing**:
|
||||
- 9 unit tests with mock LSP
|
||||
- Edge cases covered
|
||||
- 100% core logic coverage
|
||||
|
||||
**Documentation**:
|
||||
- Module README with examples
|
||||
- Inline code documentation
|
||||
- Demo script provided
|
||||
- Integration guide included
|
||||
|
||||
## Next Steps
|
||||
|
||||
Recommended enhancements:
|
||||
|
||||
1. **Multi-seed building**: Build trees from multiple seeds simultaneously
|
||||
2. **Graph visualization**: Export to DOT/Mermaid format
|
||||
3. **Incremental updates**: Update trees based on code changes
|
||||
4. **Custom scoring**: Pluggable scoring functions
|
||||
5. **Caching**: Cache frequently-accessed trees
|
||||
6. **Cross-language support**: Extend beyond Python (TypeScript, Java, etc.)
|
||||
|
||||
## Conclusion
|
||||
|
||||
The association tree implementation provides a robust foundation for LSP-based code relationship discovery in CodexLens. All core components are implemented, tested, and ready for integration into the hybrid search engine.
|
||||
|
||||
**Status**: ✅ Complete and tested
|
||||
**Files Modified**: 4
|
||||
**Files Created**: 7
|
||||
**Tests Added**: 9
|
||||
**All Tests Passing**: Yes
|
||||
@@ -1,245 +0,0 @@
|
||||
# Chain Search Implementation Summary
|
||||
|
||||
## Files Created
|
||||
|
||||
### 1. `D:\Claude_dms3\codex-lens\src\codexlens\search\__init__.py`
|
||||
Module initialization file exporting all public classes and functions:
|
||||
- `ChainSearchEngine`
|
||||
- `SearchOptions`
|
||||
- `SearchStats`
|
||||
- `ChainSearchResult`
|
||||
- `quick_search`
|
||||
|
||||
### 2. `D:\Claude_dms3\codex-lens\src\codexlens\search\chain_search.py`
|
||||
Complete implementation of the chain search engine (460+ lines) with:
|
||||
|
||||
#### Classes
|
||||
|
||||
**SearchOptions**
|
||||
- Configuration dataclass for search behavior
|
||||
- Controls depth, parallelism, result limits
|
||||
- Supports files-only and symbol search modes
|
||||
|
||||
**SearchStats**
|
||||
- Search execution statistics
|
||||
- Tracks directories searched, files matched, timing, errors
|
||||
|
||||
**ChainSearchResult**
|
||||
- Comprehensive search result container
|
||||
- Includes results, symbols, and execution statistics
|
||||
|
||||
**ChainSearchEngine**
|
||||
- Main parallel search engine
|
||||
- Thread-safe with ThreadPoolExecutor
|
||||
- Supports recursive directory traversal
|
||||
- Implements result aggregation and deduplication
|
||||
|
||||
#### Key Methods
|
||||
|
||||
**Public API:**
|
||||
- `search()` - Main search with full results
|
||||
- `search_files_only()` - Fast file path-only search
|
||||
- `search_symbols()` - Symbol search across hierarchy
|
||||
|
||||
**Internal Methods:**
|
||||
- `_find_start_index()` - Locate starting index for source path
|
||||
- `_collect_index_paths()` - Recursive index path collection via subdirs
|
||||
- `_search_parallel()` - Parallel ThreadPoolExecutor search
|
||||
- `_search_single_index()` - Single index search with error handling
|
||||
- `_merge_and_rank()` - Result deduplication and ranking
|
||||
- `_search_symbols_parallel()` - Parallel symbol search
|
||||
- `_search_symbols_single()` - Single index symbol search
|
||||
|
||||
**Convenience Function:**
|
||||
- `quick_search()` - One-line search with auto-initialization
|
||||
|
||||
## Implementation Features
|
||||
|
||||
### 1. Chain Traversal
|
||||
- Starts from source path, finds nearest index
|
||||
- Recursively collects subdirectory indexes via `subdirs` table
|
||||
- Supports depth limiting (-1 = unlimited, 0 = current only)
|
||||
- Prevents duplicate traversal with visited set
|
||||
|
||||
### 2. Parallel Execution
|
||||
- Uses ThreadPoolExecutor for concurrent searches
|
||||
- Configurable worker count (default: 8)
|
||||
- Error-tolerant: individual index failures don't block overall search
|
||||
- Collects results as futures complete
|
||||
|
||||
### 3. Result Processing
|
||||
- **Deduplication**: By file path, keeping highest score
|
||||
- **Ranking**: BM25 score descending
|
||||
- **Limiting**: Per-directory and total limits
|
||||
- **Statistics**: Comprehensive execution metrics
|
||||
|
||||
### 4. Search Modes
|
||||
- **Full search**: Results with excerpts and scores
|
||||
- **Files-only**: Fast path-only mode
|
||||
- **Symbol search**: Cross-directory symbol lookup
|
||||
|
||||
### 5. Error Handling
|
||||
- Graceful degradation on index errors
|
||||
- Missing index warnings logged
|
||||
- Error tracking in SearchStats
|
||||
- Non-blocking failure mode
|
||||
|
||||
## Search Flow Example
|
||||
|
||||
```
|
||||
search("auth", path="D:/project/src", depth=-1)
|
||||
|
|
||||
v
|
||||
[1] _find_start_index
|
||||
registry.find_index_path("D:/project/src")
|
||||
-> ~/.codexlens/indexes/D/project/src/_index.db
|
||||
|
|
||||
v
|
||||
[2] _collect_index_paths (chain traversal)
|
||||
src/_index.db
|
||||
+-- subdirs: [api, utils]
|
||||
|
|
||||
+-- api/_index.db
|
||||
| +-- subdirs: []
|
||||
|
|
||||
+-- utils/_index.db
|
||||
+-- subdirs: []
|
||||
|
||||
Result: [src/_index.db, api/_index.db, utils/_index.db]
|
||||
|
|
||||
v
|
||||
[3] _search_parallel (ThreadPoolExecutor)
|
||||
Thread1: src/ -> FTS search
|
||||
Thread2: api/ -> FTS search
|
||||
Thread3: utils/ -> FTS search
|
||||
|
|
||||
v
|
||||
[4] _merge_and_rank
|
||||
- Deduplicate by path
|
||||
- Sort by score descending
|
||||
- Apply total_limit
|
||||
|
|
||||
v
|
||||
ChainSearchResult
|
||||
```
|
||||
|
||||
## Testing
|
||||
|
||||
### Test File: `D:\Claude_dms3\codex-lens\test_chain_search.py`
|
||||
Comprehensive test suite with four test functions:
|
||||
|
||||
1. **test_basic_search()** - Full search with all options
|
||||
2. **test_quick_search()** - Convenience function test
|
||||
3. **test_symbol_search()** - Symbol search across hierarchy
|
||||
4. **test_files_only_search()** - Fast file-only mode
|
||||
|
||||
### Test Results
|
||||
- All imports successful
|
||||
- All tests pass without errors
|
||||
- Returns empty results (expected - no indexes built yet)
|
||||
- Logging shows proper "No index found" warnings
|
||||
- No crashes or exceptions
|
||||
|
||||
## Integration Points
|
||||
|
||||
### Dependencies
|
||||
- `codexlens.entities`: SearchResult, Symbol
|
||||
- `codexlens.storage.registry`: RegistryStore, DirMapping
|
||||
- `codexlens.storage.dir_index`: DirIndexStore, SubdirLink
|
||||
- `codexlens.storage.path_mapper`: PathMapper
|
||||
|
||||
### Thread Safety
|
||||
- Uses ThreadPoolExecutor for parallel searches
|
||||
- Each thread gets own DirIndexStore connection
|
||||
- SQLite WAL mode supports concurrent reads
|
||||
- Registry uses thread-local connections
|
||||
|
||||
## Usage Examples
|
||||
|
||||
### Basic Search
|
||||
```python
|
||||
from pathlib import Path
|
||||
from codexlens.search import ChainSearchEngine
|
||||
from codexlens.storage.registry import RegistryStore
|
||||
from codexlens.storage.path_mapper import PathMapper
|
||||
|
||||
registry = RegistryStore()
|
||||
registry.initialize()
|
||||
mapper = PathMapper()
|
||||
engine = ChainSearchEngine(registry, mapper)
|
||||
|
||||
result = engine.search("authentication", Path("D:/project/src"))
|
||||
print(f"Found {len(result.results)} matches in {result.stats.time_ms:.2f}ms")
|
||||
```
|
||||
|
||||
### Quick Search
|
||||
```python
|
||||
from pathlib import Path
|
||||
from codexlens.search import quick_search
|
||||
|
||||
results = quick_search("TODO", Path("D:/project"), depth=2)
|
||||
for r in results[:5]:
|
||||
print(f"{r.path}: {r.score:.2f}")
|
||||
```
|
||||
|
||||
### Symbol Search
|
||||
```python
|
||||
symbols = engine.search_symbols("init", Path("D:/project"), kind="function")
|
||||
for sym in symbols:
|
||||
print(f"{sym.name} - lines {sym.range[0]}-{sym.range[1]}")
|
||||
```
|
||||
|
||||
### Files-Only Mode
|
||||
```python
|
||||
paths = engine.search_files_only("config", Path("D:/project"))
|
||||
print(f"Files with 'config': {len(paths)}")
|
||||
```
|
||||
|
||||
## Performance Characteristics
|
||||
|
||||
### Strengths
|
||||
- **Parallel execution**: Multiple indexes searched concurrently
|
||||
- **Lazy traversal**: Only loads needed subdirectories
|
||||
- **Memory efficient**: Streaming results, no full tree in memory
|
||||
- **Depth limiting**: Can restrict search scope
|
||||
|
||||
### Considerations
|
||||
- **First search slower**: Needs to traverse subdir links
|
||||
- **Many small dirs**: Overhead from thread pool
|
||||
- **Deep hierarchies**: Depth=-1 may be slow on large trees
|
||||
|
||||
### Optimization Tips
|
||||
- Use `depth` parameter to limit scope
|
||||
- Use `limit_per_dir` to reduce per-index overhead
|
||||
- Use `files_only=True` when excerpts not needed
|
||||
- Reuse ChainSearchEngine instance for multiple searches
|
||||
|
||||
## Code Quality
|
||||
|
||||
### Standards Met
|
||||
- **Type annotations**: Full typing on all methods
|
||||
- **Docstrings**: Complete with examples and parameter docs
|
||||
- **Error handling**: Graceful degradation, no crashes
|
||||
- **ASCII-only**: Windows GBK compatible
|
||||
- **No debug spam**: Clean logging at appropriate levels
|
||||
- **Thread safety**: Proper locking and pooling
|
||||
|
||||
### Design Patterns
|
||||
- **Dataclasses**: Clean configuration and result objects
|
||||
- **Context managers**: Proper resource cleanup
|
||||
- **Dependency injection**: Registry and mapper passed in
|
||||
- **Builder pattern**: SearchOptions for configuration
|
||||
- **Template method**: _search_single_index extensible
|
||||
|
||||
## Status: Complete and Tested
|
||||
|
||||
All requirements met:
|
||||
- [x] Parallel search with ThreadPoolExecutor
|
||||
- [x] Chain traversal via subdirs links
|
||||
- [x] Depth limiting
|
||||
- [x] Error tolerance
|
||||
- [x] Search statistics
|
||||
- [x] Complete docstrings and type hints
|
||||
- [x] Test suite passes
|
||||
- [x] ASCII-only output (GBK compatible)
|
||||
- [x] Integration with existing codebase
|
||||
@@ -1,41 +0,0 @@
|
||||
# CodexLens – Optimization Plan Changelog
|
||||
|
||||
This changelog tracks the **CodexLens optimization plan** milestones (not the Python package version in `pyproject.toml`).
|
||||
|
||||
## v1.0 (Optimization) – 2025-12-26
|
||||
|
||||
### Optimizations
|
||||
|
||||
1. **P0: Context-aware hybrid chunking**
|
||||
- Docstrings are extracted into dedicated chunks and excluded from code chunks.
|
||||
- Docstring chunks include `parent_symbol` metadata when the docstring belongs to a function/class/method.
|
||||
- Sliding-window chunk boundaries are deterministic for identical input.
|
||||
|
||||
2. **P1: Adaptive RRF weights (QueryIntent)**
|
||||
- Query intent is classified as `keyword` / `semantic` / `mixed`.
|
||||
- RRF weights adapt to intent:
|
||||
- `keyword`: exact-heavy (favors lexical matches)
|
||||
- `semantic`: vector-heavy (favors semantic matches)
|
||||
- `mixed`: keeps base/default weights
|
||||
|
||||
3. **P2: Symbol boost**
|
||||
- Fused results with an explicit symbol match (`symbol_name`) receive a multiplicative boost (default `1.5x`).
|
||||
|
||||
4. **P2: Embedding-based re-ranking (optional)**
|
||||
- A second-stage ranker can reorder top results by semantic similarity.
|
||||
- Re-ranking runs only when `Config.enable_reranking=True`.
|
||||
|
||||
5. **P3: Global symbol index (incremental + fast path)**
|
||||
- `GlobalSymbolIndex` stores project-wide symbols in one SQLite DB for fast symbol lookups.
|
||||
- `ChainSearchEngine.search_symbols()` uses the global index fast path when enabled.
|
||||
|
||||
### Migration Notes
|
||||
- **Reindexing (recommended)**: deterministic chunking and docstring metadata affect stored chunks. For best results, regenerate indexes/embeddings after upgrading:
|
||||
- Rebuild indexes and/or re-run embedding generation for existing projects.
|
||||
- **New config flags**:
|
||||
- `Config.enable_reranking` (default `False`)
|
||||
- `Config.reranking_top_k` (default `50`)
|
||||
- `Config.symbol_boost_factor` (default `1.5`)
|
||||
- `Config.global_symbol_index_enabled` (default `True`)
|
||||
- **Breaking changes**: none (behavioral improvements only).
|
||||
|
||||
@@ -1,38 +0,0 @@
|
||||
# Dependency Management
|
||||
|
||||
This project uses setuptools with `pyproject.toml` for dependency management.
|
||||
|
||||
## Locking Dependencies
|
||||
|
||||
To generate a fully pinned `requirements.txt` from `requirements.in`:
|
||||
|
||||
```bash
|
||||
# Install pip-tools
|
||||
pip install pip-tools
|
||||
|
||||
# Compile requirements
|
||||
pip-compile requirements.in --output-file=requirements.txt
|
||||
|
||||
# To upgrade dependencies
|
||||
pip-compile --upgrade requirements.in --output-file=requirements.txt
|
||||
```
|
||||
|
||||
## Version Constraints
|
||||
|
||||
This project uses **pessimistic versioning** (`~=`) for dependency specifications per PEP 440:
|
||||
|
||||
- `typer~=0.9.0` means: `>=0.9.0, ==0.9.*`
|
||||
- Allows bugfix updates (0.9.0, 0.9.1, 0.9.2) but not feature/minor updates (0.10.0)
|
||||
|
||||
This provides stability while allowing automatic patch updates.
|
||||
|
||||
## Security Scanning
|
||||
|
||||
The project includes automated security scanning via GitHub Actions:
|
||||
- Runs on every push to main branch
|
||||
- Runs weekly (Sundays at 00:00 UTC)
|
||||
- Can be triggered manually
|
||||
|
||||
The scan uses:
|
||||
- `pip-audit`: Checks for known vulnerabilities in dependencies
|
||||
- `bandit`: Security linter for Python code
|
||||
@@ -1,21 +0,0 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2024 CodexLens Contributors
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
@@ -1,109 +0,0 @@
|
||||
# CodexLens
|
||||
|
||||
CodexLens is a multi-modal code analysis platform designed to provide comprehensive code understanding and analysis capabilities.
|
||||
|
||||
## Features
|
||||
|
||||
- **Multi-language Support**: Analyze code in Python, JavaScript, TypeScript and more using Tree-sitter parsers
|
||||
- **Semantic Search**: Find relevant code snippets using semantic understanding with fastembed and HNSWLIB
|
||||
- **Code Parsing**: Advanced code structure parsing with tree-sitter
|
||||
- **Flexible Architecture**: Modular design for easy extension and customization
|
||||
|
||||
## Installation
|
||||
|
||||
### Basic Installation
|
||||
|
||||
```bash
|
||||
pip install codex-lens
|
||||
```
|
||||
|
||||
### With Semantic Search
|
||||
|
||||
```bash
|
||||
pip install codex-lens[semantic]
|
||||
```
|
||||
|
||||
### With GPU Acceleration (NVIDIA CUDA)
|
||||
|
||||
```bash
|
||||
pip install codex-lens[semantic-gpu]
|
||||
```
|
||||
|
||||
### With DirectML (Windows - NVIDIA/AMD/Intel)
|
||||
|
||||
```bash
|
||||
pip install codex-lens[semantic-directml]
|
||||
```
|
||||
|
||||
### With All Optional Features
|
||||
|
||||
```bash
|
||||
pip install codex-lens[full]
|
||||
```
|
||||
|
||||
### Local ONNX Reranker Bootstrap
|
||||
|
||||
Use the pinned bootstrap flow when you want the local-only reranker backend in an
|
||||
existing CodexLens virtual environment without asking pip to resolve the whole
|
||||
project extras set at once.
|
||||
|
||||
1. Start from the CodexLens repo root and create or activate the project venv.
|
||||
2. Review the pinned install manifest in `scripts/requirements-reranker-local.txt`.
|
||||
3. Render the deterministic setup plan:
|
||||
|
||||
```bash
|
||||
python scripts/bootstrap_reranker_local.py --dry-run
|
||||
```
|
||||
|
||||
The bootstrap script always targets the selected venv Python, installs the local
|
||||
ONNX reranker stack in a fixed order, and keeps the package set pinned to the
|
||||
validated Python 3.13-compatible combination:
|
||||
|
||||
- `numpy==2.4.0`
|
||||
- `onnxruntime==1.23.2`
|
||||
- `huggingface-hub==0.36.2`
|
||||
- `transformers==4.53.3`
|
||||
- `optimum[onnxruntime]==2.1.0`
|
||||
|
||||
When you are ready to apply it to the CodexLens venv, use:
|
||||
|
||||
```bash
|
||||
python scripts/bootstrap_reranker_local.py --apply
|
||||
```
|
||||
|
||||
To pre-download the default local reranker model (`Xenova/ms-marco-MiniLM-L-6-v2`)
|
||||
into the repo-local Hugging Face cache, use:
|
||||
|
||||
```bash
|
||||
python scripts/bootstrap_reranker_local.py --apply --download-model
|
||||
```
|
||||
|
||||
The dry-run plan also prints the equivalent explicit model download command. On
|
||||
Windows PowerShell with the default repo venv, it looks like:
|
||||
|
||||
```bash
|
||||
.venv/Scripts/hf.exe download Xenova/ms-marco-MiniLM-L-6-v2 --local-dir .cache/huggingface/models/Xenova--ms-marco-MiniLM-L-6-v2
|
||||
```
|
||||
|
||||
After installation, probe the backend from the same venv:
|
||||
|
||||
```bash
|
||||
python scripts/bootstrap_reranker_local.py --apply --probe
|
||||
```
|
||||
|
||||
## Requirements
|
||||
|
||||
- Python >= 3.10
|
||||
- See `pyproject.toml` for detailed dependency list
|
||||
|
||||
## Development
|
||||
|
||||
This project uses setuptools for building and packaging.
|
||||
|
||||
## License
|
||||
|
||||
MIT License
|
||||
|
||||
## Authors
|
||||
|
||||
CodexLens Contributors
|
||||
@@ -1,83 +0,0 @@
|
||||
# Semantic Search Integration
|
||||
|
||||
## Overview
|
||||
The ChainSearchEngine now supports semantic keyword search in addition to FTS5 full-text search.
|
||||
|
||||
## Usage
|
||||
|
||||
### Enable Semantic Search
|
||||
|
||||
```python
|
||||
from pathlib import Path
|
||||
from codexlens.search.chain_search import ChainSearchEngine, SearchOptions
|
||||
from codexlens.storage.registry import RegistryStore
|
||||
from codexlens.storage.path_mapper import PathMapper
|
||||
|
||||
# Initialize
|
||||
registry = RegistryStore()
|
||||
registry.initialize()
|
||||
mapper = PathMapper()
|
||||
engine = ChainSearchEngine(registry, mapper)
|
||||
|
||||
# Create options with semantic search enabled
|
||||
options = SearchOptions(
|
||||
include_semantic=True, # Enable semantic keyword search
|
||||
total_limit=50
|
||||
)
|
||||
|
||||
# Execute search
|
||||
result = engine.search("authentication", Path("./src"), options)
|
||||
|
||||
# Results include both FTS and semantic matches
|
||||
for r in result.results:
|
||||
print(f"{r.path}: {r.score:.2f} - {r.excerpt}")
|
||||
```
|
||||
|
||||
### How It Works
|
||||
|
||||
1. **FTS Search**: Traditional full-text search using SQLite FTS5
|
||||
2. **Semantic Search**: Searches the `semantic_metadata.keywords` field
|
||||
3. **Result Merging**: Semantic results are added with 0.8x weight
|
||||
- FTS results: BM25 score from SQLite
|
||||
- Semantic results: Base score of 10.0 * 0.8 = 8.0
|
||||
4. **Deduplication**: `_merge_and_rank()` deduplicates by path, keeping highest score
|
||||
|
||||
### Result Format
|
||||
|
||||
- **FTS results**: Regular excerpt from matched content
|
||||
- **Semantic results**: `Keywords: keyword1, keyword2, keyword3, ...`
|
||||
|
||||
### Prerequisites
|
||||
|
||||
Files must have semantic metadata generated via:
|
||||
|
||||
```bash
|
||||
codex-lens enhance . --tool gemini
|
||||
```
|
||||
|
||||
This uses CCW CLI to generate summaries, keywords, and purpose descriptions.
|
||||
|
||||
## Implementation Details
|
||||
|
||||
### Changes Made
|
||||
|
||||
1. **SearchOptions**: Added `include_semantic: bool = False` parameter
|
||||
2. **_search_parallel()**: Passes `include_semantic` to worker threads
|
||||
3. **_search_single_index()**:
|
||||
- Accepts `include_semantic` parameter
|
||||
- Calls `DirIndexStore.search_semantic_keywords()` when enabled
|
||||
- Converts semantic matches to `SearchResult` objects
|
||||
- Applies 0.8x weight to semantic scores
|
||||
|
||||
### Score Weighting
|
||||
|
||||
```python
|
||||
# FTS result (from BM25)
|
||||
SearchResult(path="...", score=12.5, excerpt="...")
|
||||
|
||||
# Semantic result (fixed weighted score)
|
||||
SearchResult(path="...", score=8.0, excerpt="Keywords: ...")
|
||||
```
|
||||
|
||||
The 0.8x weight ensures semantic matches rank slightly lower than direct FTS matches
|
||||
but still appear in relevant results.
|
||||
@@ -1,16 +0,0 @@
|
||||
{"query":"executeHybridMode dense_rerank semantic smart_search","relevant_paths":["ccw/src/tools/smart-search.ts"],"intent":"ccw-semantic-routing","notes":"CCW semantic mode delegates to CodexLens dense_rerank."}
|
||||
{"query":"parse CodexLens JSON output strip ANSI smart_search","relevant_paths":["ccw/src/tools/smart-search.ts"],"intent":"ccw-json-fallback","notes":"Covers JSON/plain-text fallback handling for CodexLens output."}
|
||||
{"query":"smart_search init embed search action schema","relevant_paths":["ccw/src/tools/smart-search.ts"],"intent":"ccw-action-schema","notes":"Find the Zod schema that defines init/embed/search actions."}
|
||||
{"query":"auto init missing job dedupe smart_search","relevant_paths":["ccw/src/tools/smart-search.ts"],"intent":"ccw-auto-init","notes":"Targets background init/embed warmup and dedupe state."}
|
||||
{"query":"smart_search exact mode fallback to CodexLens fts","relevant_paths":["ccw/src/tools/smart-search.ts"],"intent":"ccw-exact-fallback","notes":"Tracks the exact-mode fallback path into CodexLens FTS."}
|
||||
{"query":"smart_search settings snapshot embedding backend reranker backend staged stage2 mode","relevant_paths":["ccw/src/tools/smart-search.ts"],"intent":"ccw-config-snapshot","notes":"Reads local config snapshot for embedding/reranker/staged pipeline settings."}
|
||||
{"query":"embedding backend fastembed local litellm api config","relevant_paths":["codex-lens/src/codexlens/config.py"],"intent":"codexlens-embedding-config","notes":"Local-only benchmark should resolve to fastembed defaults."}
|
||||
{"query":"reranker backend onnx api legacy configuration","relevant_paths":["codex-lens/src/codexlens/config.py","codex-lens/src/codexlens/env_config.py"],"intent":"codexlens-reranker-config","notes":"Covers both config dataclass fields and env overrides."}
|
||||
{"query":"staged stage2 mode precomputed realtime static_global_graph","relevant_paths":["codex-lens/src/codexlens/config.py","codex-lens/src/codexlens/env_config.py"],"intent":"codexlens-stage2-config","notes":"Benchmark matrix should exercise the three supported stage2 modes."}
|
||||
{"query":"enable staged rerank stage 4 config","relevant_paths":["codex-lens/src/codexlens/config.py"],"intent":"codexlens-stage4-rerank","notes":"Stage 4 rerank flag needs to stay enabled for local benchmarks."}
|
||||
{"query":"cascade_search dense_rerank staged pipeline ChainSearchEngine","relevant_paths":["codex-lens/src/codexlens/search/chain_search.py"],"intent":"chain-search-cascade","notes":"Baseline query for the central retrieval engine."}
|
||||
{"query":"realtime LSP expand stage2 search pipeline","relevant_paths":["codex-lens/src/codexlens/search/chain_search.py"],"intent":"chain-search-stage2-realtime","notes":"Targets realtime stage2 expansion logic."}
|
||||
{"query":"static global graph stage2 expansion implementation","relevant_paths":["codex-lens/src/codexlens/search/chain_search.py"],"intent":"chain-search-stage2-static","notes":"Targets static_global_graph stage2 expansion logic."}
|
||||
{"query":"cross encoder rerank stage 4 implementation","relevant_paths":["codex-lens/src/codexlens/search/chain_search.py"],"intent":"chain-search-rerank","notes":"Relevant for dense_rerank and staged rerank latency comparisons."}
|
||||
{"query":"get_reranker factory onnx backend selection","relevant_paths":["codex-lens/src/codexlens/semantic/reranker/factory.py"],"intent":"reranker-factory","notes":"Keeps the benchmark aligned with local ONNX reranker selection."}
|
||||
{"query":"EMBEDDING_BACKEND and RERANKER_BACKEND environment variables","relevant_paths":["codex-lens/src/codexlens/env_config.py"],"intent":"env-overrides","notes":"Covers CCW/CodexLens local-only environment overrides."}
|
||||
@@ -1,33 +0,0 @@
|
||||
{"query":"class StandaloneLspManager","relevant_paths":["codexlens/lsp/standalone_manager.py"]}
|
||||
{"query":"def _open_document","relevant_paths":["codexlens/lsp/standalone_manager.py"]}
|
||||
{"query":"def _read_message","relevant_paths":["codexlens/lsp/standalone_manager.py"]}
|
||||
{"query":"how does textDocument/didOpen work","relevant_paths":["codexlens/lsp/standalone_manager.py"]}
|
||||
{"query":"class LspBridge","relevant_paths":["codexlens/lsp/lsp_bridge.py"]}
|
||||
{"query":"def get_document_symbols","relevant_paths":["codexlens/lsp/lsp_bridge.py"]}
|
||||
{"query":"class KeepAliveLspBridge","relevant_paths":["codexlens/lsp/keepalive_bridge.py"]}
|
||||
{"query":"LSP keepalive bridge","relevant_paths":["codexlens/lsp/keepalive_bridge.py"]}
|
||||
{"query":"class LspGraphBuilder","relevant_paths":["codexlens/lsp/lsp_graph_builder.py"]}
|
||||
{"query":"def build_from_seeds","relevant_paths":["codexlens/lsp/lsp_graph_builder.py"]}
|
||||
{"query":"def _stage2_realtime_lsp_expand","relevant_paths":["codexlens/search/chain_search.py"]}
|
||||
{"query":"def _stage3_cluster_prune","relevant_paths":["codexlens/search/chain_search.py"]}
|
||||
{"query":"def _cross_encoder_rerank","relevant_paths":["codexlens/search/chain_search.py"]}
|
||||
{"query":"def dense_rerank_cascade_search","relevant_paths":["codexlens/search/chain_search.py"]}
|
||||
{"query":"def cascade_search","relevant_paths":["codexlens/search/chain_search.py"]}
|
||||
{"query":"def _find_nearest_binary_mmap_root","relevant_paths":["codexlens/search/chain_search.py"]}
|
||||
{"query":"class BinarySearcher","relevant_paths":["codexlens/search/binary_searcher.py"]}
|
||||
{"query":"class GraphExpander","relevant_paths":["codexlens/search/graph_expander.py"]}
|
||||
{"query":"def cross_encoder_rerank","relevant_paths":["codexlens/search/ranking.py"]}
|
||||
{"query":"def group_similar_results","relevant_paths":["codexlens/search/ranking.py"]}
|
||||
{"query":"class ConfigError","relevant_paths":["codexlens/errors.py"]}
|
||||
{"query":"def load_settings","relevant_paths":["codexlens/config.py"]}
|
||||
{"query":"BINARY_VECTORS_MMAP_NAME","relevant_paths":["codexlens/config.py"]}
|
||||
{"query":"STAGED_CLUSTERING_STRATEGY","relevant_paths":["codexlens/config.py","codexlens/env_config.py"]}
|
||||
{"query":"def apply_workspace_env","relevant_paths":["codexlens/env_config.py"]}
|
||||
{"query":"def generate_env_example","relevant_paths":["codexlens/env_config.py"]}
|
||||
{"query":"def get_reranker","relevant_paths":["codexlens/semantic/reranker/factory.py"]}
|
||||
{"query":"class APIReranker","relevant_paths":["codexlens/semantic/reranker/api_reranker.py"]}
|
||||
{"query":"class RegistryStore","relevant_paths":["codexlens/storage/registry.py"]}
|
||||
{"query":"class PathMapper","relevant_paths":["codexlens/storage/path_mapper.py"]}
|
||||
{"query":"def lsp_status","relevant_paths":["codexlens/cli/commands.py"]}
|
||||
{"query":"graph_neighbors migration","relevant_paths":["codexlens/storage/migrations/migration_007_add_graph_neighbors.py"]}
|
||||
{"query":"def get_model_config","relevant_paths":["codexlens/semantic/vector_store.py"]}
|
||||
@@ -1,245 +0,0 @@
|
||||
"""Analyze hybrid search methods contribution."""
|
||||
import json
|
||||
import sqlite3
|
||||
import time
|
||||
from pathlib import Path
|
||||
from collections import defaultdict
|
||||
import sys
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
||||
|
||||
from codexlens.search.hybrid_search import HybridSearchEngine
|
||||
from codexlens.search.ranking import (
|
||||
reciprocal_rank_fusion,
|
||||
cross_encoder_rerank,
|
||||
DEFAULT_WEIGHTS,
|
||||
)
|
||||
|
||||
# Use index with most data
|
||||
index_path = Path(r"C:\Users\dyw\.codexlens\indexes\D\Claude_dms3\codex-lens\src\codexlens\storage\_index.db")
|
||||
|
||||
print("=" * 60)
|
||||
print("1. STORAGE ARCHITECTURE ANALYSIS")
|
||||
print("=" * 60)
|
||||
|
||||
# Analyze storage
|
||||
with sqlite3.connect(index_path) as conn:
|
||||
cursor = conn.execute(
|
||||
"SELECT name FROM sqlite_master WHERE type='table' ORDER BY name"
|
||||
)
|
||||
tables = [row[0] for row in cursor.fetchall()]
|
||||
|
||||
print("\nTable Overview:")
|
||||
for table in tables:
|
||||
try:
|
||||
count = conn.execute(f"SELECT COUNT(*) FROM {table}").fetchone()[0]
|
||||
if count > 0:
|
||||
print(f" {table}: {count} rows")
|
||||
except:
|
||||
pass
|
||||
|
||||
print("\n--- Conflict Analysis ---")
|
||||
|
||||
chunks_count = 0
|
||||
semantic_count = 0
|
||||
|
||||
if "chunks" in tables:
|
||||
chunks_count = conn.execute("SELECT COUNT(*) FROM chunks").fetchone()[0]
|
||||
if "semantic_chunks" in tables:
|
||||
semantic_count = conn.execute("SELECT COUNT(*) FROM semantic_chunks").fetchone()[0]
|
||||
|
||||
print(f" chunks table: {chunks_count} rows")
|
||||
print(f" semantic_chunks table: {semantic_count} rows")
|
||||
|
||||
if semantic_count > 0:
|
||||
col_info = conn.execute("PRAGMA table_info(semantic_chunks)").fetchall()
|
||||
col_names = [c[1] for c in col_info]
|
||||
|
||||
print(f"\n semantic_chunks columns: {col_names}")
|
||||
|
||||
for col in ["embedding", "embedding_binary", "embedding_dense"]:
|
||||
if col in col_names:
|
||||
null_count = conn.execute(
|
||||
f"SELECT COUNT(*) FROM semantic_chunks WHERE {col} IS NULL"
|
||||
).fetchone()[0]
|
||||
non_null = semantic_count - null_count
|
||||
print(f" {col}: {non_null}/{semantic_count} non-null")
|
||||
|
||||
print("\n" + "=" * 60)
|
||||
print("2. METHOD CONTRIBUTION ANALYSIS")
|
||||
print("=" * 60)
|
||||
|
||||
queries = [
|
||||
"database connection",
|
||||
"create table",
|
||||
"sqlite store",
|
||||
"migration",
|
||||
"search chunks",
|
||||
]
|
||||
|
||||
results_summary = {
|
||||
"fts_exact": [],
|
||||
"fts_fuzzy": [],
|
||||
"vector": [],
|
||||
}
|
||||
|
||||
for query in queries:
|
||||
print(f"\nQuery: '{query}'")
|
||||
|
||||
# FTS Exact
|
||||
try:
|
||||
engine = HybridSearchEngine(weights=DEFAULT_WEIGHTS)
|
||||
engine._config = type("obj", (object,), {
|
||||
"use_fts_fallback": True,
|
||||
"embedding_use_gpu": True,
|
||||
"symbol_boost_factor": 1.5,
|
||||
"enable_reranking": False,
|
||||
})()
|
||||
|
||||
start = time.perf_counter()
|
||||
results = engine.search(index_path, query, limit=10, enable_fuzzy=False, enable_vector=False)
|
||||
latency = (time.perf_counter() - start) * 1000
|
||||
|
||||
results_summary["fts_exact"].append({"count": len(results), "latency": latency})
|
||||
top_file = results[0].path.split("\\")[-1] if results else "N/A"
|
||||
top_score = results[0].score if results else 0
|
||||
print(f" FTS Exact: {len(results)} results, {latency:.1f}ms, top: {top_file} ({top_score:.3f})")
|
||||
except Exception as e:
|
||||
print(f" FTS Exact: ERROR - {e}")
|
||||
|
||||
# FTS Fuzzy
|
||||
try:
|
||||
engine = HybridSearchEngine(weights=DEFAULT_WEIGHTS)
|
||||
engine._config = type("obj", (object,), {
|
||||
"use_fts_fallback": True,
|
||||
"embedding_use_gpu": True,
|
||||
"symbol_boost_factor": 1.5,
|
||||
"enable_reranking": False,
|
||||
})()
|
||||
|
||||
start = time.perf_counter()
|
||||
results = engine.search(index_path, query, limit=10, enable_fuzzy=True, enable_vector=False)
|
||||
latency = (time.perf_counter() - start) * 1000
|
||||
|
||||
results_summary["fts_fuzzy"].append({"count": len(results), "latency": latency})
|
||||
top_file = results[0].path.split("\\")[-1] if results else "N/A"
|
||||
top_score = results[0].score if results else 0
|
||||
print(f" FTS Fuzzy: {len(results)} results, {latency:.1f}ms, top: {top_file} ({top_score:.3f})")
|
||||
except Exception as e:
|
||||
print(f" FTS Fuzzy: ERROR - {e}")
|
||||
|
||||
# Vector
|
||||
try:
|
||||
engine = HybridSearchEngine()
|
||||
engine._config = type("obj", (object,), {
|
||||
"use_fts_fallback": False,
|
||||
"embedding_use_gpu": True,
|
||||
"symbol_boost_factor": 1.5,
|
||||
"enable_reranking": False,
|
||||
})()
|
||||
|
||||
start = time.perf_counter()
|
||||
results = engine.search(index_path, query, limit=10, enable_vector=True, pure_vector=True)
|
||||
latency = (time.perf_counter() - start) * 1000
|
||||
|
||||
results_summary["vector"].append({"count": len(results), "latency": latency})
|
||||
top_file = results[0].path.split("\\")[-1] if results else "N/A"
|
||||
top_score = results[0].score if results else 0
|
||||
print(f" Vector: {len(results)} results, {latency:.1f}ms, top: {top_file} ({top_score:.3f})")
|
||||
except Exception as e:
|
||||
print(f" Vector: ERROR - {e}")
|
||||
|
||||
print("\n--- Summary ---")
|
||||
for method, data in results_summary.items():
|
||||
if data:
|
||||
avg_count = sum(d["count"] for d in data) / len(data)
|
||||
avg_latency = sum(d["latency"] for d in data) / len(data)
|
||||
print(f"{method}: avg {avg_count:.1f} results, {avg_latency:.1f}ms")
|
||||
|
||||
print("\n" + "=" * 60)
|
||||
print("3. FTS + RERANK FUSION EXPERIMENT")
|
||||
print("=" * 60)
|
||||
|
||||
# Initialize reranker
|
||||
reranker = None
|
||||
try:
|
||||
from codexlens.semantic.reranker import get_reranker, check_reranker_available
|
||||
ok, _ = check_reranker_available("onnx")
|
||||
if ok:
|
||||
reranker = get_reranker(backend="onnx", use_gpu=True)
|
||||
print("\nReranker loaded: ONNX backend")
|
||||
except Exception as e:
|
||||
print(f"\nReranker unavailable: {e}")
|
||||
|
||||
test_queries = ["database connection", "create table migration"]
|
||||
|
||||
for query in test_queries:
|
||||
print(f"\nQuery: '{query}'")
|
||||
|
||||
# Strategy 1: Standard Hybrid (FTS exact+fuzzy RRF)
|
||||
try:
|
||||
engine = HybridSearchEngine(weights=DEFAULT_WEIGHTS)
|
||||
engine._config = type("obj", (object,), {
|
||||
"use_fts_fallback": True,
|
||||
"embedding_use_gpu": True,
|
||||
"symbol_boost_factor": 1.5,
|
||||
"enable_reranking": False,
|
||||
})()
|
||||
|
||||
start = time.perf_counter()
|
||||
standard_results = engine.search(index_path, query, limit=10, enable_fuzzy=True, enable_vector=False)
|
||||
standard_latency = (time.perf_counter() - start) * 1000
|
||||
|
||||
print(f" Standard FTS RRF: {len(standard_results)} results, {standard_latency:.1f}ms")
|
||||
for i, r in enumerate(standard_results[:3]):
|
||||
print(f" {i+1}. {r.path.split(chr(92))[-1]} (score: {r.score:.4f})")
|
||||
except Exception as e:
|
||||
print(f" Standard FTS RRF: ERROR - {e}")
|
||||
standard_results = []
|
||||
|
||||
# Strategy 2: FTS + CrossEncoder Rerank
|
||||
if reranker and standard_results:
|
||||
try:
|
||||
start = time.perf_counter()
|
||||
reranked_results = cross_encoder_rerank(query, standard_results, reranker, top_k=10)
|
||||
rerank_latency = (time.perf_counter() - start) * 1000
|
||||
|
||||
print(f" FTS + Rerank: {len(reranked_results)} results, {rerank_latency:.1f}ms (rerank only)")
|
||||
for i, r in enumerate(reranked_results[:3]):
|
||||
ce_score = r.metadata.get("cross_encoder_prob", r.score)
|
||||
print(f" {i+1}. {r.path.split(chr(92))[-1]} (CE prob: {ce_score:.4f})")
|
||||
|
||||
# Compare rankings
|
||||
standard_order = [r.path.split("\\")[-1] for r in standard_results[:5]]
|
||||
reranked_order = [r.path.split("\\")[-1] for r in reranked_results[:5]]
|
||||
|
||||
if standard_order != reranked_order:
|
||||
print(f" Ranking changed!")
|
||||
print(f" Before: {standard_order}")
|
||||
print(f" After: {reranked_order}")
|
||||
else:
|
||||
print(f" Ranking unchanged")
|
||||
|
||||
except Exception as e:
|
||||
print(f" FTS + Rerank: ERROR - {e}")
|
||||
|
||||
print("\n" + "=" * 60)
|
||||
print("CONCLUSIONS")
|
||||
print("=" * 60)
|
||||
print("""
|
||||
1. Storage Architecture:
|
||||
- semantic_chunks: Used by cascade-index (binary+dense vectors)
|
||||
- chunks: Used by legacy SQLiteStore (currently empty in this index)
|
||||
- files_fts_*: Used by FTS exact/fuzzy search
|
||||
|
||||
CONFLICT: binary_cascade_search reads from semantic_chunks,
|
||||
but standard FTS reads from files table. These are SEPARATE paths.
|
||||
|
||||
2. Method Contributions:
|
||||
- FTS: Fast but limited to keyword matching
|
||||
- Vector: Semantic understanding but requires embeddings
|
||||
|
||||
3. FTS + Rerank Fusion:
|
||||
- CrossEncoder reranking can improve precision
|
||||
- Adds ~100-200ms latency per query
|
||||
- Most effective when initial FTS recall is good
|
||||
""")
|
||||
@@ -1,209 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
"""Micro-benchmark for BinaryANNIndex search performance.
|
||||
|
||||
Measures the actual speedup of vectorized Hamming distance computation.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import gc
|
||||
import statistics
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
# Add src to path
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
def old_search_implementation(query_arr: np.ndarray, vectors: dict, id_list: list, top_k: int):
    """Original O(N) loop-based implementation for comparison.

    Args:
        query_arr: Packed query bits as a uint8 array.
        vectors: Mapping of vector id -> packed vector bytes.
        id_list: Ordered ids whose vectors are scanned.
        top_k: Number of nearest results to return.

    Returns:
        Tuple ``(ids, dists)``: the ``top_k`` ids with the smallest
        Hamming distance to the query, and those distances.
    """
    # Fixed: removed unused local `packed_dim` (was computed but never read).
    distances = []

    for vec_id in id_list:
        vec = vectors[vec_id]
        vec_arr = np.frombuffer(vec, dtype=np.uint8)
        # Hamming distance = number of set bits in the XOR of packed vectors.
        xor = np.bitwise_xor(query_arr, vec_arr)
        dist = int(np.unpackbits(xor).sum())
        distances.append((vec_id, dist))

    distances.sort(key=lambda x: x[1])
    top_results = distances[:top_k]
    ids = [r[0] for r in top_results]
    dists = [r[1] for r in top_results]

    return ids, dists
|
||||
|
||||
|
||||
def new_search_implementation(query_arr: np.ndarray, vectors_matrix: np.ndarray, ids_array: np.ndarray, top_k: int):
    """Optimized vectorized implementation.

    XORs the packed query against every row of ``vectors_matrix`` in a
    single broadcast, counts set bits per row via a 256-entry popcount
    lookup table, and selects the ``top_k`` smallest Hamming distances
    with ``argpartition`` (falling back to a full ``argsort`` when every
    row is requested).
    """
    # Broadcast XOR of the query against all stored vectors at once.
    diff = np.bitwise_xor(vectors_matrix, query_arr)

    # Per-byte popcount via lookup table, then sum bytes across each row.
    popcount_table = np.array([bin(value).count('1') for value in range(256)], dtype=np.uint8)
    row_distances = popcount_table[diff].sum(axis=1)

    total = len(row_distances)
    k = min(top_k, total)

    if k == total:
        # Requesting everything: a full sort is required anyway.
        ordering = np.argsort(row_distances)
    else:
        # Partial selection: partition out the k smallest, then sort them.
        nearest = np.argpartition(row_distances, k)[:k]
        ordering = nearest[np.argsort(row_distances[nearest])]

    return ids_array[ordering].tolist(), row_distances[ordering].tolist()
|
||||
|
||||
|
||||
def run_benchmark(n_vectors: int, dim: int = 256, top_k: int = 100, n_iterations: int = 50):
    """Run benchmark comparing old and new implementations.

    Generates ``n_vectors`` random packed binary vectors of ``dim`` bits,
    times ``n_iterations`` top-``top_k`` searches with each implementation,
    checks the two return equivalent distances, and prints a comparison
    table.

    Args:
        n_vectors: Number of random vectors to index.
        dim: Vector width in bits (must be a multiple of 8).
        top_k: Result count requested from each search.
        n_iterations: Timed repetitions per implementation.

    Returns:
        Dict with the run parameters plus ``old_avg_ms``, ``new_avg_ms``
        and ``speedup``.
    """
    packed_dim = dim // 8  # 32 bytes for 256-bit

    print(f"\n{'='*60}")
    print(f"Binary Search Micro-Benchmark")
    print(f"{'='*60}")
    print(f"Vectors: {n_vectors}")
    print(f"Dimension: {dim} bits ({packed_dim} bytes packed)")
    print(f"Top-K: {top_k}")
    print(f"Iterations: {n_iterations}")
    print(f"{'='*60}\n")

    # Generate random binary vectors (dict + id list feed the old API).
    print("Generating test data...")
    vectors_dict = {}
    id_list = []

    for i in range(n_vectors):
        vec_bytes = np.random.randint(0, 256, size=packed_dim, dtype=np.uint8).tobytes()
        vectors_dict[i] = vec_bytes
        id_list.append(i)

    # Build matrix for vectorized search (same data, matrix layout).
    vectors_matrix = np.empty((n_vectors, packed_dim), dtype=np.uint8)
    ids_array = np.array(id_list, dtype=np.int64)

    for i, vec_id in enumerate(id_list):
        vec_bytes = vectors_dict[vec_id]
        vectors_matrix[i] = np.frombuffer(vec_bytes, dtype=np.uint8)

    # Generate random query
    query_bytes = np.random.randint(0, 256, size=packed_dim, dtype=np.uint8).tobytes()
    query_arr = np.frombuffer(query_bytes, dtype=np.uint8)

    # Warmup: amortize one-time costs (allocator, caches) before timing.
    print("Running warmup...")
    for _ in range(3):
        old_search_implementation(query_arr, vectors_dict, id_list, top_k)
        new_search_implementation(query_arr, vectors_matrix, ids_array, top_k)

    # Benchmark old implementation; gc.collect() before each run keeps
    # collector pauses out of the measured interval.
    print("Benchmarking old implementation...")
    old_times = []
    for _ in range(n_iterations):
        gc.collect()
        start = time.perf_counter()
        old_ids, old_dists = old_search_implementation(query_arr, vectors_dict, id_list, top_k)
        elapsed = (time.perf_counter() - start) * 1000
        old_times.append(elapsed)

    # Benchmark new implementation
    print("Benchmarking new implementation...")
    new_times = []
    for _ in range(n_iterations):
        gc.collect()
        start = time.perf_counter()
        new_ids, new_dists = new_search_implementation(query_arr, vectors_matrix, ids_array, top_k)
        elapsed = (time.perf_counter() - start) * 1000
        new_times.append(elapsed)

    # Verify correctness
    print("\nVerifying correctness...")
    # Check that distances are correct (IDs may differ for ties)
    if old_dists == new_dists:
        print("Distances match! (IDs may differ for ties)")
    else:
        # Check if difference is just in tie-breaking: equal-distance rows
        # may be ordered differently by sort vs argpartition.
        old_dist_set = set(old_dists)
        new_dist_set = set(new_dists)
        if old_dist_set == new_dist_set:
            print("Distances equivalent (tie-breaking differs, which is acceptable)")
        else:
            print("WARNING: Distance distributions differ!")
            print(f" Old dists (first 5): {old_dists[:5]}")
            print(f" New dists (first 5): {new_dists[:5]}")

    # Calculate statistics (stdev needs >= 2 samples, hence the guards).
    old_avg = statistics.mean(old_times)
    old_std = statistics.stdev(old_times) if len(old_times) > 1 else 0
    new_avg = statistics.mean(new_times)
    new_std = statistics.stdev(new_times) if len(new_times) > 1 else 0

    speedup = old_avg / new_avg if new_avg > 0 else 0

    # Print results
    print(f"\n{'='*60}")
    print("RESULTS")
    print(f"{'='*60}")
    print(f"{'Metric':<25} {'Old (loop)':>15} {'New (vectorized)':>18}")
    print(f"{'-'*25} {'-'*15} {'-'*18}")
    print(f"{'Avg Latency (ms)':<25} {old_avg:>15.3f} {new_avg:>18.3f}")
    print(f"{'Std Dev (ms)':<25} {old_std:>15.3f} {new_std:>18.3f}")
    print(f"{'Min Latency (ms)':<25} {min(old_times):>15.3f} {min(new_times):>18.3f}")
    print(f"{'Max Latency (ms)':<25} {max(old_times):>15.3f} {max(new_times):>18.3f}")
    print(f"{'P50 (ms)':<25} {sorted(old_times)[len(old_times)//2]:>15.3f} {sorted(new_times)[len(new_times)//2]:>18.3f}")
    print(f"\n{'Speedup:':<25} {speedup:>15.2f}x")
    print(f"{'='*60}\n")

    return {
        "n_vectors": n_vectors,
        "dim": dim,
        "top_k": top_k,
        "old_avg_ms": old_avg,
        "new_avg_ms": new_avg,
        "speedup": speedup,
    }
|
||||
|
||||
|
||||
def main():
    """Run the micro-benchmark across several index sizes and print a summary."""
    banner = "=" * 70
    print("\n" + banner)
    print(" BINARY SEARCH OPTIMIZATION MICRO-BENCHMARK")
    print(banner)

    # Benchmark each vector count at fixed dimension / top-k settings.
    results = [
        run_benchmark(n_vectors=size, dim=256, top_k=100, n_iterations=20)
        for size in (1000, 5000, 10000, 50000)
    ]

    # Summary table: one row per index size.
    print("\n" + banner)
    print(" SUMMARY")
    print(banner)
    print(f"{'N Vectors':<12} {'Old (ms)':<12} {'New (ms)':<12} {'Speedup':>10}")
    print("-" * 50)
    for entry in results:
        print(f"{entry['n_vectors']:<12} {entry['old_avg_ms']:<12.3f} {entry['new_avg_ms']:<12.3f} {entry['speedup']:>10.2f}x")
    print(banner)


if __name__ == "__main__":
    main()
|
||||
@@ -1,402 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
"""Benchmark script for comparing cascade search strategies.
|
||||
|
||||
Compares:
|
||||
- binary: 256-dim binary coarse ranking + 2048-dim dense fine ranking
|
||||
- hybrid: FTS+Vector coarse ranking + CrossEncoder fine ranking
|
||||
|
||||
Usage:
|
||||
python benchmarks/cascade_benchmark.py [--source PATH] [--queries N] [--warmup N]
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import gc
|
||||
import json
|
||||
import os
|
||||
import statistics
|
||||
import sys
|
||||
import time
|
||||
import traceback
|
||||
from dataclasses import dataclass, asdict
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Dict, Any
|
||||
|
||||
# Add src to path
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
||||
|
||||
from codexlens.search.chain_search import ChainSearchEngine, SearchOptions
|
||||
from codexlens.config import Config
|
||||
from codexlens.storage.registry import RegistryStore
|
||||
from codexlens.storage.path_mapper import PathMapper
|
||||
|
||||
|
||||
@dataclass
class BenchmarkResult:
    """Result from a single benchmark run.

    One instance is produced per (strategy, query) pair; ``error`` is set
    when the search raised instead of returning results.
    """
    strategy: str              # cascade strategy that was benchmarked
    query: str                 # search query that was executed
    latency_ms: float          # wall-clock duration of the search call, in ms
    num_results: int           # number of results returned (0 on error)
    top_result: Optional[str]  # "path:line" of the best hit, or None if no results
    error: Optional[str] = None  # stringified exception if the run failed
|
||||
|
||||
|
||||
@dataclass
class BenchmarkSummary:
    """Aggregated benchmark statistics.

    All latency statistics are computed over successful runs only;
    failed runs are counted in ``total_queries`` and listed in ``errors``.
    """
    strategy: str              # strategy these statistics describe
    total_queries: int         # number of runs attempted (success + failure)
    successful_queries: int    # runs that completed without an exception
    avg_latency_ms: float      # mean latency over successful runs
    min_latency_ms: float      # fastest successful run
    max_latency_ms: float      # slowest successful run
    p50_latency_ms: float      # median latency
    p95_latency_ms: float      # 95th-percentile latency
    p99_latency_ms: float      # 99th-percentile latency
    avg_results: float         # mean result count over successful runs
    errors: List[str]          # error messages from failed runs
|
||||
|
||||
|
||||
# Default test queries covering different scenarios.
# Three groups of five: literal code patterns, natural-language semantic
# queries, and domain-specific technical terms.
DEFAULT_QUERIES = [
    # Code patterns
    "def search",
    "class Engine",
    "import numpy",
    "async def",
    "raise ValueError",
    # Semantic queries
    "how to parse json",
    "database connection",
    "error handling",
    "authentication logic",
    "file read write",
    # Technical terms
    "embedding vector",
    "cosine similarity",
    "binary quantization",
    "hamming distance",
    "reranking",
]
|
||||
|
||||
|
||||
def percentile(data: List[float], p: float) -> float:
    """Return the p-th percentile of *data* using linear interpolation.

    Args:
        data: Samples in any order (sorted internally). Empty input
            yields 0.0.
        p: Percentile in the range 0-100.
    """
    if not data:
        return 0.0
    ordered = sorted(data)
    # Fractional rank into the sorted samples.
    rank = (len(ordered) - 1) * (p / 100)
    lower = int(rank)
    upper = min(lower + 1, len(ordered) - 1)
    fraction = rank - lower
    # Interpolate between the two bracketing samples.
    return ordered[lower] + fraction * (ordered[upper] - ordered[lower])
|
||||
|
||||
|
||||
def run_single_benchmark(
    engine: ChainSearchEngine,
    query: str,
    source_path: Path,
    strategy: str,
    options: Optional[SearchOptions] = None,
) -> BenchmarkResult:
    """Run a single benchmark query.

    Times one ``cascade_search`` call and packages the outcome as a
    BenchmarkResult. Exceptions are captured into the result's ``error``
    field rather than propagated, so one failing strategy does not abort
    the whole benchmark.

    Args:
        engine: Search engine to exercise.
        query: Query string to search for.
        source_path: Root path of the indexed source tree.
        strategy: Cascade strategy name passed through to the engine.
        options: Optional extra search options.
    """
    # Keep collector pauses out of the measured interval.
    gc.collect()

    start_time = time.perf_counter()
    try:
        result = engine.cascade_search(
            query=query,
            source_path=source_path,
            k=10,
            coarse_k=100,
            options=options,
            strategy=strategy,
        )
        elapsed_ms = (time.perf_counter() - start_time) * 1000

        # Record the best hit as "path:line" for quick eyeballing.
        top_result = None
        if result.results:
            r = result.results[0]
            line = r.start_line or 0
            top_result = f"{r.path}:{line}"

        return BenchmarkResult(
            strategy=strategy,
            query=query,
            latency_ms=elapsed_ms,
            num_results=len(result.results),
            top_result=top_result,
        )
    except Exception as e:
        # Deliberate broad catch: failures are data points, not fatal.
        elapsed_ms = (time.perf_counter() - start_time) * 1000
        return BenchmarkResult(
            strategy=strategy,
            query=query,
            latency_ms=elapsed_ms,
            num_results=0,
            top_result=None,
            error=str(e),
        )
|
||||
|
||||
|
||||
def run_benchmarks(
    source_path: Path,
    queries: List[str],
    strategies: List[str],
    warmup_runs: int = 2,
    options: Optional[SearchOptions] = None,
) -> Dict[str, List[BenchmarkResult]]:
    """Run benchmarks for all queries and strategies.

    Initializes a ChainSearchEngine (default registry/mapper paths),
    optionally performs warmup queries per strategy, then times every
    (query, strategy) combination with progress output.

    Args:
        source_path: Root path of the indexed source tree.
        queries: Query strings to benchmark.
        strategies: Cascade strategy names to compare.
        warmup_runs: Untimed runs per strategy before measuring.
        options: Optional extra search options forwarded to each run.

    Returns:
        Mapping of strategy name -> list of per-query BenchmarkResults.
    """

    print(f"\n{'='*60}")
    print(f"Cascade Search Benchmark")
    print(f"{'='*60}")
    print(f"Source: {source_path}")
    print(f"Queries: {len(queries)}")
    print(f"Strategies: {strategies}")
    print(f"Warmup runs: {warmup_runs}")
    print(f"{'='*60}\n")

    # Initialize engine
    config = Config()
    registry = RegistryStore()  # Uses default path
    registry.initialize()
    mapper = PathMapper()  # Uses default path
    engine = ChainSearchEngine(registry=registry, mapper=mapper, config=config)

    results: Dict[str, List[BenchmarkResult]] = {s: [] for s in strategies}

    # Warmup phase: results (and errors) are intentionally discarded.
    if warmup_runs > 0:
        print(f"Running {warmup_runs} warmup queries...")
        warmup_query = queries[0] if queries else "test"
        for strategy in strategies:
            for _ in range(warmup_runs):
                try:
                    run_single_benchmark(engine, warmup_query, source_path, strategy, options)
                except Exception:
                    pass
        print("Warmup complete.\n")

    # Benchmark phase
    total_runs = len(queries) * len(strategies)
    current_run = 0

    for query in queries:
        for strategy in strategies:
            current_run += 1
            # Progress line; result of the run is appended on the same line.
            print(f"[{current_run}/{total_runs}] {strategy}: '{query[:40]}...' ", end="", flush=True)

            result = run_single_benchmark(engine, query, source_path, strategy, options)
            results[strategy].append(result)

            if result.error:
                print(f"ERROR: {result.error[:50]}")
            else:
                print(f"{result.latency_ms:.1f}ms, {result.num_results} results")

    return results
|
||||
|
||||
|
||||
def summarize_results(results: Dict[str, List[BenchmarkResult]]) -> Dict[str, BenchmarkSummary]:
|
||||
"""Generate summary statistics for each strategy."""
|
||||
summaries = {}
|
||||
|
||||
for strategy, benchmark_results in results.items():
|
||||
latencies = [r.latency_ms for r in benchmark_results if r.error is None]
|
||||
result_counts = [r.num_results for r in benchmark_results if r.error is None]
|
||||
errors = [r.error for r in benchmark_results if r.error is not None]
|
||||
|
||||
if latencies:
|
||||
summary = BenchmarkSummary(
|
||||
strategy=strategy,
|
||||
total_queries=len(benchmark_results),
|
||||
successful_queries=len(latencies),
|
||||
avg_latency_ms=statistics.mean(latencies),
|
||||
min_latency_ms=min(latencies),
|
||||
max_latency_ms=max(latencies),
|
||||
p50_latency_ms=percentile(latencies, 50),
|
||||
p95_latency_ms=percentile(latencies, 95),
|
||||
p99_latency_ms=percentile(latencies, 99),
|
||||
avg_results=statistics.mean(result_counts) if result_counts else 0,
|
||||
errors=errors,
|
||||
)
|
||||
else:
|
||||
summary = BenchmarkSummary(
|
||||
strategy=strategy,
|
||||
total_queries=len(benchmark_results),
|
||||
successful_queries=0,
|
||||
avg_latency_ms=0,
|
||||
min_latency_ms=0,
|
||||
max_latency_ms=0,
|
||||
p50_latency_ms=0,
|
||||
p95_latency_ms=0,
|
||||
p99_latency_ms=0,
|
||||
avg_results=0,
|
||||
errors=errors,
|
||||
)
|
||||
|
||||
summaries[strategy] = summary
|
||||
|
||||
return summaries
|
||||
|
||||
|
||||
def print_comparison_table(summaries: Dict[str, BenchmarkSummary]) -> None:
|
||||
"""Print formatted comparison table."""
|
||||
print(f"\n{'='*80}")
|
||||
print("BENCHMARK RESULTS COMPARISON")
|
||||
print(f"{'='*80}\n")
|
||||
|
||||
# Header
|
||||
print(f"{'Metric':<25} {'Binary':>15} {'Hybrid':>15} {'Diff':>15}")
|
||||
print(f"{'-'*25} {'-'*15} {'-'*15} {'-'*15}")
|
||||
|
||||
binary = summaries.get("binary")
|
||||
hybrid = summaries.get("hybrid")
|
||||
|
||||
if not binary or not hybrid:
|
||||
print("Missing results for comparison")
|
||||
return
|
||||
|
||||
metrics = [
|
||||
("Total Queries", binary.total_queries, hybrid.total_queries),
|
||||
("Successful", binary.successful_queries, hybrid.successful_queries),
|
||||
("Avg Latency (ms)", binary.avg_latency_ms, hybrid.avg_latency_ms),
|
||||
("Min Latency (ms)", binary.min_latency_ms, hybrid.min_latency_ms),
|
||||
("Max Latency (ms)", binary.max_latency_ms, hybrid.max_latency_ms),
|
||||
("P50 Latency (ms)", binary.p50_latency_ms, hybrid.p50_latency_ms),
|
||||
("P95 Latency (ms)", binary.p95_latency_ms, hybrid.p95_latency_ms),
|
||||
("P99 Latency (ms)", binary.p99_latency_ms, hybrid.p99_latency_ms),
|
||||
("Avg Results", binary.avg_results, hybrid.avg_results),
|
||||
]
|
||||
|
||||
for name, b_val, h_val in metrics:
|
||||
if isinstance(b_val, float):
|
||||
diff = b_val - h_val
|
||||
diff_str = f"{diff:+.2f}" if diff != 0 else "0.00"
|
||||
speedup = h_val / b_val if b_val > 0 else 0
|
||||
if "Latency" in name and speedup > 1:
|
||||
diff_str += f" ({speedup:.1f}x faster)"
|
||||
print(f"{name:<25} {b_val:>15.2f} {h_val:>15.2f} {diff_str:>15}")
|
||||
else:
|
||||
diff = b_val - h_val
|
||||
print(f"{name:<25} {b_val:>15} {h_val:>15} {diff:>+15}")
|
||||
|
||||
# Errors
|
||||
print(f"\n{'Errors:':<25}")
|
||||
print(f" Binary: {len(binary.errors)}")
|
||||
for err in binary.errors[:3]:
|
||||
print(f" - {err[:60]}...")
|
||||
print(f" Hybrid: {len(hybrid.errors)}")
|
||||
for err in hybrid.errors[:3]:
|
||||
print(f" - {err[:60]}...")
|
||||
|
||||
# Winner
|
||||
print(f"\n{'='*80}")
|
||||
if binary.avg_latency_ms < hybrid.avg_latency_ms and binary.successful_queries > 0:
|
||||
speedup = hybrid.avg_latency_ms / binary.avg_latency_ms
|
||||
print(f"[WINNER] Binary ({speedup:.2f}x faster average latency)")
|
||||
elif hybrid.avg_latency_ms < binary.avg_latency_ms and hybrid.successful_queries > 0:
|
||||
speedup = binary.avg_latency_ms / hybrid.avg_latency_ms
|
||||
print(f"[WINNER] Hybrid ({speedup:.2f}x faster average latency)")
|
||||
else:
|
||||
print("No clear winner (check errors)")
|
||||
print(f"{'='*80}\n")
|
||||
|
||||
|
||||
def save_results(
|
||||
results: Dict[str, List[BenchmarkResult]],
|
||||
summaries: Dict[str, BenchmarkSummary],
|
||||
output_path: Path,
|
||||
) -> None:
|
||||
"""Save benchmark results to JSON file."""
|
||||
data = {
|
||||
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"summaries": {k: asdict(v) for k, v in summaries.items()},
|
||||
"details": {
|
||||
k: [asdict(r) for r in v]
|
||||
for k, v in results.items()
|
||||
},
|
||||
}
|
||||
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(output_path, "w", encoding="utf-8") as f:
|
||||
json.dump(data, f, indent=2)
|
||||
|
||||
print(f"Results saved to: {output_path}")
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Benchmark cascade search strategies")
|
||||
parser.add_argument(
|
||||
"--source", "-s",
|
||||
type=Path,
|
||||
default=Path(__file__).parent.parent / "src",
|
||||
help="Source directory to search (default: ./src)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--queries", "-q",
|
||||
type=int,
|
||||
default=len(DEFAULT_QUERIES),
|
||||
help=f"Number of queries to run (default: {len(DEFAULT_QUERIES)})",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--warmup", "-w",
|
||||
type=int,
|
||||
default=2,
|
||||
help="Number of warmup runs (default: 2)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output", "-o",
|
||||
type=Path,
|
||||
default=Path(__file__).parent / "results" / "cascade_benchmark.json",
|
||||
help="Output file for results (default: benchmarks/results/cascade_benchmark.json)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--strategies",
|
||||
nargs="+",
|
||||
default=["binary", "hybrid"],
|
||||
choices=["binary", "hybrid"],
|
||||
help="Strategies to benchmark (default: both)",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Validate source path
|
||||
if not args.source.exists():
|
||||
print(f"Error: Source path does not exist: {args.source}")
|
||||
sys.exit(1)
|
||||
|
||||
# Select queries
|
||||
queries = DEFAULT_QUERIES[:args.queries]
|
||||
|
||||
# Run benchmarks
|
||||
try:
|
||||
results = run_benchmarks(
|
||||
source_path=args.source,
|
||||
queries=queries,
|
||||
strategies=args.strategies,
|
||||
warmup_runs=args.warmup,
|
||||
)
|
||||
|
||||
# Generate summaries
|
||||
summaries = summarize_results(results)
|
||||
|
||||
# Print comparison
|
||||
print_comparison_table(summaries)
|
||||
|
||||
# Save results
|
||||
save_results(results, summaries, args.output)
|
||||
|
||||
except KeyboardInterrupt:
|
||||
print("\nBenchmark interrupted.")
|
||||
sys.exit(1)
|
||||
except Exception as e:
|
||||
print(f"\nBenchmark failed: {e}")
|
||||
traceback.print_exc()
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,365 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
"""Compare labeled accuracy: staged(realtime LSP graph) vs dense_rerank.
|
||||
|
||||
This script measures retrieval "accuracy" against a labeled query set.
|
||||
Each query must provide a list of relevant file paths (relative to --source
|
||||
or absolute). We report:
|
||||
- Hit@K (any relevant file appears in top-K)
|
||||
- MRR@K (reciprocal rank of first relevant file within top-K)
|
||||
- Recall@K (fraction of relevant files present in top-K)
|
||||
|
||||
Example:
|
||||
python benchmarks/compare_accuracy_labeled.py --source ./src
|
||||
python benchmarks/compare_accuracy_labeled.py --queries-file benchmarks/accuracy_queries_codexlens.jsonl
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import gc
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import statistics
|
||||
import sys
|
||||
import time
|
||||
from dataclasses import asdict, dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
|
||||
|
||||
# Add src to path (match other benchmark scripts)
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
||||
|
||||
from codexlens.config import Config
|
||||
from codexlens.search.chain_search import ChainSearchEngine, SearchOptions
|
||||
from codexlens.storage.path_mapper import PathMapper
|
||||
from codexlens.storage.registry import RegistryStore
|
||||
|
||||
|
||||
DEFAULT_QUERIES_FILE = Path(__file__).parent / "accuracy_queries_codexlens.jsonl"
|
||||
|
||||
|
||||
def _now_ms() -> float:
|
||||
return time.perf_counter() * 1000.0
|
||||
|
||||
|
||||
def _normalize_path_key(path: str) -> str:
|
||||
"""Normalize file paths for overlap/dedup metrics (Windows-safe)."""
|
||||
try:
|
||||
p = Path(path)
|
||||
# Don't explode on non-files like "<memory>".
|
||||
if str(p) and (p.is_absolute() or re.match(r"^[A-Za-z]:", str(p))):
|
||||
norm = str(p.resolve())
|
||||
else:
|
||||
norm = str(p)
|
||||
except Exception:
|
||||
norm = path
|
||||
norm = norm.replace("/", "\\")
|
||||
if os.name == "nt":
|
||||
norm = norm.lower()
|
||||
return norm
|
||||
|
||||
|
||||
def _load_labeled_queries(path: Path, limit: Optional[int]) -> List[Dict[str, Any]]:
|
||||
if not path.is_file():
|
||||
raise SystemExit(f"Queries file does not exist: {path}")
|
||||
|
||||
out: List[Dict[str, Any]] = []
|
||||
for raw_line in path.read_text(encoding="utf-8", errors="ignore").splitlines():
|
||||
line = raw_line.strip()
|
||||
if not line or line.startswith("#"):
|
||||
continue
|
||||
try:
|
||||
item = json.loads(line)
|
||||
except Exception as exc:
|
||||
raise SystemExit(f"Invalid JSONL line in {path}: {raw_line!r} ({exc})") from exc
|
||||
if not isinstance(item, dict) or "query" not in item:
|
||||
raise SystemExit(f"Invalid query item (expected object with 'query'): {item!r}")
|
||||
out.append(item)
|
||||
if limit is not None and len(out) >= limit:
|
||||
break
|
||||
return out
|
||||
|
||||
|
||||
def _dedup_topk(paths: Iterable[str], k: int) -> List[str]:
|
||||
out: List[str] = []
|
||||
seen: set[str] = set()
|
||||
for p in paths:
|
||||
if p in seen:
|
||||
continue
|
||||
seen.add(p)
|
||||
out.append(p)
|
||||
if len(out) >= k:
|
||||
break
|
||||
return out
|
||||
|
||||
|
||||
def _first_hit_rank(topk_paths: Sequence[str], relevant: set[str]) -> Optional[int]:
|
||||
for i, p in enumerate(topk_paths, start=1):
|
||||
if p in relevant:
|
||||
return i
|
||||
return None
|
||||
|
||||
|
||||
@dataclass
|
||||
class StrategyRun:
|
||||
strategy: str
|
||||
latency_ms: float
|
||||
topk_paths: List[str]
|
||||
first_hit_rank: Optional[int]
|
||||
hit_at_k: bool
|
||||
recall_at_k: float
|
||||
error: Optional[str] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class QueryEval:
|
||||
query: str
|
||||
relevant_paths: List[str]
|
||||
staged: StrategyRun
|
||||
dense_rerank: StrategyRun
|
||||
|
||||
|
||||
def _run_strategy(
|
||||
engine: ChainSearchEngine,
|
||||
*,
|
||||
strategy: str,
|
||||
query: str,
|
||||
source_path: Path,
|
||||
k: int,
|
||||
coarse_k: int,
|
||||
relevant: set[str],
|
||||
options: Optional[SearchOptions] = None,
|
||||
) -> StrategyRun:
|
||||
gc.collect()
|
||||
start_ms = _now_ms()
|
||||
try:
|
||||
result = engine.cascade_search(
|
||||
query=query,
|
||||
source_path=source_path,
|
||||
k=k,
|
||||
coarse_k=coarse_k,
|
||||
options=options,
|
||||
strategy=strategy,
|
||||
)
|
||||
latency_ms = _now_ms() - start_ms
|
||||
paths_raw = [r.path for r in (result.results or []) if getattr(r, "path", None)]
|
||||
paths_norm = [_normalize_path_key(p) for p in paths_raw]
|
||||
topk = _dedup_topk(paths_norm, k=k)
|
||||
rank = _first_hit_rank(topk, relevant)
|
||||
hit = rank is not None
|
||||
recall = 0.0
|
||||
if relevant:
|
||||
recall = len(set(topk) & relevant) / float(len(relevant))
|
||||
return StrategyRun(
|
||||
strategy=strategy,
|
||||
latency_ms=latency_ms,
|
||||
topk_paths=topk,
|
||||
first_hit_rank=rank,
|
||||
hit_at_k=hit,
|
||||
recall_at_k=recall,
|
||||
error=None,
|
||||
)
|
||||
except Exception as exc:
|
||||
latency_ms = _now_ms() - start_ms
|
||||
return StrategyRun(
|
||||
strategy=strategy,
|
||||
latency_ms=latency_ms,
|
||||
topk_paths=[],
|
||||
first_hit_rank=None,
|
||||
hit_at_k=False,
|
||||
recall_at_k=0.0,
|
||||
error=repr(exc),
|
||||
)
|
||||
|
||||
|
||||
def _mrr(ranks: Sequence[Optional[int]]) -> float:
|
||||
vals = []
|
||||
for r in ranks:
|
||||
if r is None or r <= 0:
|
||||
vals.append(0.0)
|
||||
else:
|
||||
vals.append(1.0 / float(r))
|
||||
return statistics.mean(vals) if vals else 0.0
|
||||
|
||||
|
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Compare labeled retrieval accuracy: staged(realtime) vs dense_rerank"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--source",
|
||||
type=Path,
|
||||
default=Path(__file__).parent.parent / "src",
|
||||
help="Source directory to search (default: ./src)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--queries-file",
|
||||
type=Path,
|
||||
default=DEFAULT_QUERIES_FILE,
|
||||
help="JSONL file with {query, relevant_paths[]} per line",
|
||||
)
|
||||
parser.add_argument("--queries", type=int, default=None, help="Limit number of queries")
|
||||
parser.add_argument("--k", type=int, default=10, help="Top-K for evaluation (default 10)")
|
||||
parser.add_argument("--coarse-k", type=int, default=100, help="Coarse candidates (default 100)")
|
||||
parser.add_argument(
|
||||
"--staged-cluster-strategy",
|
||||
type=str,
|
||||
default="path",
|
||||
help="Config.staged_clustering_strategy override for staged (default: path)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--stage2-mode",
|
||||
type=str,
|
||||
default="realtime",
|
||||
help="Config.staged_stage2_mode override for staged (default: realtime)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output",
|
||||
type=Path,
|
||||
default=Path(__file__).parent / "results" / "accuracy_labeled.json",
|
||||
help="Output JSON path",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
if not args.source.exists():
|
||||
raise SystemExit(f"Source path does not exist: {args.source}")
|
||||
|
||||
labeled = _load_labeled_queries(args.queries_file, args.queries)
|
||||
if not labeled:
|
||||
raise SystemExit("No queries to run")
|
||||
|
||||
source_root = args.source.expanduser().resolve()
|
||||
|
||||
# Match CLI behavior: load settings + apply global/workspace .env overrides.
|
||||
config = Config.load()
|
||||
config.cascade_strategy = "staged"
|
||||
config.staged_stage2_mode = str(args.stage2_mode or "realtime").strip().lower()
|
||||
config.enable_staged_rerank = True
|
||||
config.staged_clustering_strategy = str(args.staged_cluster_strategy or "path").strip().lower()
|
||||
# Stability: on some Windows setups, DirectML/ONNX can crash under load.
|
||||
config.embedding_use_gpu = False
|
||||
config.reranker_use_gpu = False
|
||||
|
||||
registry = RegistryStore()
|
||||
registry.initialize()
|
||||
mapper = PathMapper()
|
||||
engine = ChainSearchEngine(registry=registry, mapper=mapper, config=config)
|
||||
|
||||
def resolve_expected(paths: Sequence[str]) -> set[str]:
|
||||
out: set[str] = set()
|
||||
for p in paths:
|
||||
try:
|
||||
cand = Path(p)
|
||||
if not cand.is_absolute():
|
||||
cand = (source_root / cand).resolve()
|
||||
out.add(_normalize_path_key(str(cand)))
|
||||
except Exception:
|
||||
out.add(_normalize_path_key(p))
|
||||
return out
|
||||
|
||||
evaluations: List[QueryEval] = []
|
||||
|
||||
try:
|
||||
for i, item in enumerate(labeled, start=1):
|
||||
query = str(item.get("query", "")).strip()
|
||||
relevant_raw = item.get("relevant_paths") or []
|
||||
if not query:
|
||||
continue
|
||||
if not isinstance(relevant_raw, list) or not relevant_raw:
|
||||
raise SystemExit(f"Query item missing relevant_paths[]: {item!r}")
|
||||
relevant = resolve_expected([str(p) for p in relevant_raw])
|
||||
|
||||
print(f"[{i}/{len(labeled)}] {query}")
|
||||
|
||||
staged = _run_strategy(
|
||||
engine,
|
||||
strategy="staged",
|
||||
query=query,
|
||||
source_path=source_root,
|
||||
k=int(args.k),
|
||||
coarse_k=int(args.coarse_k),
|
||||
relevant=relevant,
|
||||
options=None,
|
||||
)
|
||||
dense = _run_strategy(
|
||||
engine,
|
||||
strategy="dense_rerank",
|
||||
query=query,
|
||||
source_path=source_root,
|
||||
k=int(args.k),
|
||||
coarse_k=int(args.coarse_k),
|
||||
relevant=relevant,
|
||||
options=None,
|
||||
)
|
||||
|
||||
evaluations.append(
|
||||
QueryEval(
|
||||
query=query,
|
||||
relevant_paths=[_normalize_path_key(str((source_root / p).resolve())) if not Path(p).is_absolute() else _normalize_path_key(p) for p in relevant_raw],
|
||||
staged=staged,
|
||||
dense_rerank=dense,
|
||||
)
|
||||
)
|
||||
finally:
|
||||
try:
|
||||
engine.close()
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
registry.close()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
staged_runs = [e.staged for e in evaluations]
|
||||
dense_runs = [e.dense_rerank for e in evaluations]
|
||||
|
||||
def mean(xs: Sequence[float]) -> float:
|
||||
return statistics.mean(xs) if xs else 0.0
|
||||
|
||||
staged_ranks = [r.first_hit_rank for r in staged_runs]
|
||||
dense_ranks = [r.first_hit_rank for r in dense_runs]
|
||||
|
||||
summary = {
|
||||
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"source": str(source_root),
|
||||
"queries_file": str(args.queries_file),
|
||||
"query_count": len(evaluations),
|
||||
"k": int(args.k),
|
||||
"coarse_k": int(args.coarse_k),
|
||||
"staged": {
|
||||
"hit_at_k": mean([1.0 if r.hit_at_k else 0.0 for r in staged_runs]),
|
||||
"mrr_at_k": _mrr(staged_ranks),
|
||||
"avg_recall_at_k": mean([r.recall_at_k for r in staged_runs]),
|
||||
"avg_latency_ms": mean([r.latency_ms for r in staged_runs if not r.error]),
|
||||
"errors": sum(1 for r in staged_runs if r.error),
|
||||
},
|
||||
"dense_rerank": {
|
||||
"hit_at_k": mean([1.0 if r.hit_at_k else 0.0 for r in dense_runs]),
|
||||
"mrr_at_k": _mrr(dense_ranks),
|
||||
"avg_recall_at_k": mean([r.recall_at_k for r in dense_runs]),
|
||||
"avg_latency_ms": mean([r.latency_ms for r in dense_runs if not r.error]),
|
||||
"errors": sum(1 for r in dense_runs if r.error),
|
||||
},
|
||||
"config": {
|
||||
"staged_stage2_mode": config.staged_stage2_mode,
|
||||
"staged_clustering_strategy": config.staged_clustering_strategy,
|
||||
"enable_staged_rerank": bool(config.enable_staged_rerank),
|
||||
"reranker_backend": config.reranker_backend,
|
||||
"reranker_model": config.reranker_model,
|
||||
"embedding_backend": config.embedding_backend,
|
||||
"embedding_model": config.embedding_model,
|
||||
},
|
||||
}
|
||||
|
||||
payload = {"summary": summary, "evaluations": [asdict(e) for e in evaluations]}
|
||||
args.output.parent.mkdir(parents=True, exist_ok=True)
|
||||
args.output.write_text(json.dumps(payload, indent=2), encoding="utf-8")
|
||||
|
||||
print("\n=== SUMMARY ===")
|
||||
print(json.dumps(summary, indent=2))
|
||||
print(f"\nSaved: {args.output}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,980 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
"""Benchmark local-only staged stage2 modes for CCW smart_search queries.
|
||||
|
||||
This benchmark reuses the existing CodexLens benchmark style, but focuses on
|
||||
the real search intents that drive CCW `smart_search`. It evaluates:
|
||||
|
||||
1. `dense_rerank` baseline
|
||||
2. `staged` + `precomputed`
|
||||
3. `staged` + `realtime`
|
||||
4. `staged` + `static_global_graph`
|
||||
|
||||
Metrics:
|
||||
- Hit@K
|
||||
- MRR@K
|
||||
- Recall@K
|
||||
- latency (avg/p50/p95)
|
||||
|
||||
The runner is intentionally local-only. By default it uses:
|
||||
- embedding backend: `fastembed`
|
||||
- reranker backend: `onnx`
|
||||
|
||||
Examples:
|
||||
python benchmarks/compare_ccw_smart_search_stage2.py --dry-run
|
||||
python benchmarks/compare_ccw_smart_search_stage2.py --self-check
|
||||
python benchmarks/compare_ccw_smart_search_stage2.py --source .. --k 10
|
||||
python benchmarks/compare_ccw_smart_search_stage2.py --embedding-model code --reranker-model cross-encoder/ms-marco-MiniLM-L-6-v2
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
from copy import deepcopy
|
||||
import gc
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import statistics
|
||||
import sys
|
||||
import time
|
||||
from dataclasses import asdict, dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
||||
|
||||
from codexlens.config import Config
|
||||
from codexlens.search.chain_search import ChainSearchEngine, SearchOptions
|
||||
from codexlens.search.ranking import (
|
||||
QueryIntent,
|
||||
detect_query_intent,
|
||||
is_generated_artifact_path,
|
||||
is_test_file,
|
||||
query_prefers_lexical_search,
|
||||
query_targets_generated_files,
|
||||
)
|
||||
from codexlens.storage.path_mapper import PathMapper
|
||||
from codexlens.storage.registry import RegistryStore
|
||||
|
||||
|
||||
DEFAULT_SOURCE = Path(__file__).resolve().parents[2]
|
||||
DEFAULT_QUERIES_FILE = Path(__file__).parent / "accuracy_queries_ccw_smart_search.jsonl"
|
||||
DEFAULT_OUTPUT = Path(__file__).parent / "results" / "ccw_smart_search_stage2.json"
|
||||
|
||||
VALID_STAGE2_MODES = ("precomputed", "realtime", "static_global_graph")
|
||||
VALID_LOCAL_EMBEDDING_BACKENDS = ("fastembed",)
|
||||
VALID_LOCAL_RERANKER_BACKENDS = ("onnx", "fastembed", "legacy")
|
||||
VALID_BASELINE_METHODS = ("auto", "fts", "hybrid")
|
||||
DEFAULT_LOCAL_ONNX_RERANKER_MODEL = "Xenova/ms-marco-MiniLM-L-6-v2"
|
||||
|
||||
|
||||
def _now_ms() -> float:
|
||||
return time.perf_counter() * 1000.0
|
||||
|
||||
|
||||
def _normalize_path_key(path: str) -> str:
|
||||
try:
|
||||
candidate = Path(path)
|
||||
if str(candidate) and (candidate.is_absolute() or re.match(r"^[A-Za-z]:", str(candidate))):
|
||||
normalized = str(candidate.resolve())
|
||||
else:
|
||||
normalized = str(candidate)
|
||||
except Exception:
|
||||
normalized = path
|
||||
normalized = normalized.replace("/", "\\")
|
||||
if os.name == "nt":
|
||||
normalized = normalized.lower()
|
||||
return normalized
|
||||
|
||||
|
||||
def _dedup_topk(paths: Iterable[str], k: int) -> List[str]:
|
||||
output: List[str] = []
|
||||
seen: set[str] = set()
|
||||
for path in paths:
|
||||
if path in seen:
|
||||
continue
|
||||
seen.add(path)
|
||||
output.append(path)
|
||||
if len(output) >= k:
|
||||
break
|
||||
return output
|
||||
|
||||
|
||||
def _first_hit_rank(topk_paths: Sequence[str], relevant: set[str]) -> Optional[int]:
|
||||
for index, path in enumerate(topk_paths, start=1):
|
||||
if path in relevant:
|
||||
return index
|
||||
return None
|
||||
|
||||
|
||||
def _mrr(ranks: Sequence[Optional[int]]) -> float:
|
||||
values = [1.0 / rank for rank in ranks if rank and rank > 0]
|
||||
return statistics.mean(values) if values else 0.0
|
||||
|
||||
|
||||
def _mean(values: Sequence[float]) -> float:
|
||||
return statistics.mean(values) if values else 0.0
|
||||
|
||||
|
||||
def _percentile(values: Sequence[float], percentile: float) -> float:
|
||||
if not values:
|
||||
return 0.0
|
||||
ordered = sorted(values)
|
||||
if len(ordered) == 1:
|
||||
return ordered[0]
|
||||
index = (len(ordered) - 1) * percentile
|
||||
lower = int(index)
|
||||
upper = min(lower + 1, len(ordered) - 1)
|
||||
if lower == upper:
|
||||
return ordered[lower]
|
||||
fraction = index - lower
|
||||
return ordered[lower] + (ordered[upper] - ordered[lower]) * fraction
|
||||
|
||||
|
||||
def _load_labeled_queries(path: Path, limit: Optional[int]) -> List[Dict[str, Any]]:
|
||||
if not path.is_file():
|
||||
raise SystemExit(f"Queries file does not exist: {path}")
|
||||
|
||||
output: List[Dict[str, Any]] = []
|
||||
for raw_line in path.read_text(encoding="utf-8", errors="ignore").splitlines():
|
||||
line = raw_line.strip()
|
||||
if not line or line.startswith("#"):
|
||||
continue
|
||||
try:
|
||||
item = json.loads(line)
|
||||
except Exception as exc:
|
||||
raise SystemExit(f"Invalid JSONL line in {path}: {raw_line!r} ({exc})") from exc
|
||||
if not isinstance(item, dict) or "query" not in item or "relevant_paths" not in item:
|
||||
raise SystemExit(f"Invalid query item (expected object with query/relevant_paths): {item!r}")
|
||||
relevant_paths = item.get("relevant_paths")
|
||||
if not isinstance(relevant_paths, list) or not relevant_paths:
|
||||
raise SystemExit(f"Query item must include non-empty relevant_paths[]: {item!r}")
|
||||
output.append(item)
|
||||
if limit is not None and len(output) >= limit:
|
||||
break
|
||||
return output
|
||||
|
||||
|
||||
def _resolve_expected_paths(source_root: Path, paths: Sequence[str]) -> Tuple[List[str], set[str], List[str]]:
|
||||
resolved_display: List[str] = []
|
||||
resolved_keys: set[str] = set()
|
||||
missing: List[str] = []
|
||||
|
||||
for raw_path in paths:
|
||||
candidate = Path(raw_path)
|
||||
if not candidate.is_absolute():
|
||||
candidate = (source_root / candidate).resolve()
|
||||
if not candidate.exists():
|
||||
missing.append(str(candidate))
|
||||
resolved_display.append(str(candidate))
|
||||
resolved_keys.add(_normalize_path_key(str(candidate)))
|
||||
return resolved_display, resolved_keys, missing
|
||||
|
||||
|
||||
def _validate_local_only_backends(embedding_backend: str, reranker_backend: str) -> None:
|
||||
if embedding_backend not in VALID_LOCAL_EMBEDDING_BACKENDS:
|
||||
raise SystemExit(
|
||||
"This runner is local-only. "
|
||||
f"--embedding-backend must be one of {', '.join(VALID_LOCAL_EMBEDDING_BACKENDS)}; got {embedding_backend!r}"
|
||||
)
|
||||
if reranker_backend not in VALID_LOCAL_RERANKER_BACKENDS:
|
||||
raise SystemExit(
|
||||
"This runner is local-only. "
|
||||
f"--reranker-backend must be one of {', '.join(VALID_LOCAL_RERANKER_BACKENDS)}; got {reranker_backend!r}"
|
||||
)
|
||||
|
||||
|
||||
def _validate_stage2_modes(stage2_modes: Sequence[str]) -> List[str]:
|
||||
normalized = [str(mode).strip().lower() for mode in stage2_modes if str(mode).strip()]
|
||||
if not normalized:
|
||||
raise SystemExit("At least one --stage2-modes entry is required")
|
||||
invalid = [mode for mode in normalized if mode not in VALID_STAGE2_MODES]
|
||||
if invalid:
|
||||
raise SystemExit(
|
||||
f"Invalid --stage2-modes entry: {invalid[0]} "
|
||||
f"(valid: {', '.join(VALID_STAGE2_MODES)})"
|
||||
)
|
||||
deduped: List[str] = []
|
||||
seen: set[str] = set()
|
||||
for mode in normalized:
|
||||
if mode in seen:
|
||||
continue
|
||||
seen.add(mode)
|
||||
deduped.append(mode)
|
||||
return deduped
|
||||
|
||||
|
||||
def _validate_baseline_methods(methods: Sequence[str]) -> List[str]:
|
||||
normalized = [str(method).strip().lower() for method in methods if str(method).strip()]
|
||||
invalid = [method for method in normalized if method not in VALID_BASELINE_METHODS]
|
||||
if invalid:
|
||||
raise SystemExit(
|
||||
f"Invalid --baseline-methods entry: {invalid[0]} "
|
||||
f"(valid: {', '.join(VALID_BASELINE_METHODS)})"
|
||||
)
|
||||
deduped: List[str] = []
|
||||
seen: set[str] = set()
|
||||
for method in normalized:
|
||||
if method in seen:
|
||||
continue
|
||||
seen.add(method)
|
||||
deduped.append(method)
|
||||
return deduped
|
||||
|
||||
|
||||
@dataclass
|
||||
class StrategyRun:
|
||||
strategy_key: str
|
||||
strategy: str
|
||||
stage2_mode: Optional[str]
|
||||
effective_method: str
|
||||
execution_method: str
|
||||
latency_ms: float
|
||||
topk_paths: List[str]
|
||||
first_hit_rank: Optional[int]
|
||||
hit_at_k: bool
|
||||
recall_at_k: float
|
||||
generated_artifact_count: int
|
||||
test_file_count: int
|
||||
error: Optional[str] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class QueryEvaluation:
|
||||
query: str
|
||||
intent: Optional[str]
|
||||
notes: Optional[str]
|
||||
relevant_paths: List[str]
|
||||
runs: Dict[str, StrategyRun]
|
||||
|
||||
|
||||
@dataclass
|
||||
class PairwiseDelta:
|
||||
mode_a: str
|
||||
mode_b: str
|
||||
hit_at_k_delta: float
|
||||
mrr_at_k_delta: float
|
||||
avg_recall_at_k_delta: float
|
||||
avg_latency_ms_delta: float
|
||||
|
||||
|
||||
@dataclass
|
||||
class StrategySpec:
|
||||
strategy_key: str
|
||||
strategy: str
|
||||
stage2_mode: Optional[str]
|
||||
|
||||
|
||||
@dataclass
|
||||
class StrategyRuntime:
|
||||
strategy_spec: StrategySpec
|
||||
config: Config
|
||||
registry: RegistryStore
|
||||
engine: ChainSearchEngine
|
||||
|
||||
|
||||
def _strategy_specs(
|
||||
stage2_modes: Sequence[str],
|
||||
include_dense_baseline: bool,
|
||||
*,
|
||||
baseline_methods: Sequence[str],
|
||||
) -> List[StrategySpec]:
|
||||
specs: List[StrategySpec] = []
|
||||
for method in baseline_methods:
|
||||
specs.append(StrategySpec(strategy_key=method, strategy=method, stage2_mode=None))
|
||||
if include_dense_baseline:
|
||||
specs.append(StrategySpec(strategy_key="dense_rerank", strategy="dense_rerank", stage2_mode=None))
|
||||
for stage2_mode in stage2_modes:
|
||||
specs.append(
|
||||
StrategySpec(
|
||||
strategy_key=f"staged:{stage2_mode}",
|
||||
strategy="staged",
|
||||
stage2_mode=stage2_mode,
|
||||
)
|
||||
)
|
||||
return specs
|
||||
|
||||
|
||||
def _build_strategy_runtime(base_config: Config, strategy_spec: StrategySpec) -> StrategyRuntime:
|
||||
runtime_config = deepcopy(base_config)
|
||||
registry = RegistryStore()
|
||||
registry.initialize()
|
||||
mapper = PathMapper()
|
||||
engine = ChainSearchEngine(registry=registry, mapper=mapper, config=runtime_config)
|
||||
return StrategyRuntime(
|
||||
strategy_spec=strategy_spec,
|
||||
config=runtime_config,
|
||||
registry=registry,
|
||||
engine=engine,
|
||||
)
|
||||
|
||||
|
||||
def _select_effective_method(query: str, requested_method: str) -> str:
|
||||
requested = str(requested_method).strip().lower()
|
||||
if requested != "auto":
|
||||
return requested
|
||||
if query_targets_generated_files(query) or query_prefers_lexical_search(query):
|
||||
return "fts"
|
||||
intent = detect_query_intent(query)
|
||||
if intent == QueryIntent.KEYWORD:
|
||||
return "fts"
|
||||
if intent == QueryIntent.SEMANTIC:
|
||||
return "dense_rerank"
|
||||
return "hybrid"
|
||||
|
||||
|
||||
def _filter_dataset_by_query_match(
|
||||
dataset: Sequence[Dict[str, Any]],
|
||||
query_match: Optional[str],
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""Filter labeled queries by case-insensitive substring match."""
|
||||
needle = str(query_match or "").strip().casefold()
|
||||
if not needle:
|
||||
return list(dataset)
|
||||
return [
|
||||
dict(item)
|
||||
for item in dataset
|
||||
if needle in str(item.get("query", "")).casefold()
|
||||
]
|
||||
|
||||
|
||||
def _apply_query_limit(
|
||||
dataset: Sequence[Dict[str, Any]],
|
||||
query_limit: Optional[int],
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""Apply the optional query limit after any dataset-level filtering."""
|
||||
if query_limit is None:
|
||||
return list(dataset)
|
||||
return [dict(item) for item in list(dataset)[: max(0, int(query_limit))]]
|
||||
|
||||
|
||||
def _write_json_payload(path: Path, payload: Dict[str, Any]) -> None:
|
||||
"""Persist a benchmark payload as UTF-8 JSON."""
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
|
||||
|
||||
|
||||
def _write_final_outputs(
|
||||
*,
|
||||
output_path: Path,
|
||||
progress_output: Optional[Path],
|
||||
payload: Dict[str, Any],
|
||||
) -> None:
|
||||
"""Persist the final completed payload to both result and progress outputs."""
|
||||
_write_json_payload(output_path, payload)
|
||||
if progress_output is not None:
|
||||
_write_json_payload(progress_output, payload)
|
||||
|
||||
|
||||
def _make_progress_payload(
    *,
    args: argparse.Namespace,
    source_root: Path,
    strategy_specs: Sequence[StrategySpec],
    evaluations: Sequence[QueryEvaluation],
    query_index: int,
    total_queries: int,
    run_index: int,
    total_runs: int,
    current_query: str,
    current_strategy_key: str,
) -> Dict[str, Any]:
    """Create a partial progress snapshot for long benchmark runs.

    The snapshot mirrors the shape of the final payload (status, settings,
    serialized evaluations) and adds a ``progress`` section so a watcher can
    tail the progress file without special-casing its format. Only the
    already-completed evaluations are included.
    """
    return {
        "status": "running",
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
        "source": str(source_root),
        "queries_file": str(args.queries_file),
        # Completed so far, vs. the planned total below.
        "query_count": len(evaluations),
        "planned_query_count": total_queries,
        "k": int(args.k),
        "coarse_k": int(args.coarse_k),
        "strategy_keys": [spec.strategy_key for spec in strategy_specs],
        "progress": {
            "completed_queries": query_index,
            "total_queries": total_queries,
            "completed_runs": run_index,
            "total_runs": total_runs,
            "current_query": current_query,
            "current_strategy_key": current_strategy_key,
        },
        # Serialize dataclass runs to plain dicts so the payload is JSON-safe.
        "evaluations": [
            {
                "query": evaluation.query,
                "intent": evaluation.intent,
                "notes": evaluation.notes,
                "relevant_paths": evaluation.relevant_paths,
                "runs": {key: asdict(run) for key, run in evaluation.runs.items()},
            }
            for evaluation in evaluations
        ],
    }
|
||||
|
||||
|
||||
def _make_search_options(method: str, *, k: int) -> SearchOptions:
    """Build the SearchOptions flag set for one benchmark method.

    Supported methods: fts, hybrid, dense_rerank, staged (the latter two
    share the cascade flag set). Raises ValueError for anything else.
    """
    normalized = str(method).strip().lower()
    # Per-method flags: (hybrid_mode, enable_vector, enable_cascade).
    flag_table = {
        "fts": (False, False, False),
        "hybrid": (True, True, False),
        "dense_rerank": (True, True, True),
        "staged": (True, True, True),
    }
    if normalized not in flag_table:
        raise ValueError(f"Unsupported benchmark method: {method}")
    hybrid_mode, enable_vector, enable_cascade = flag_table[normalized]
    return SearchOptions(
        total_limit=k,
        hybrid_mode=hybrid_mode,
        enable_fuzzy=False,
        enable_vector=enable_vector,
        pure_vector=False,
        enable_cascade=enable_cascade,
    )
|
||||
|
||||
|
||||
def _run_strategy(
    engine: ChainSearchEngine,
    config: Config,
    *,
    strategy_spec: StrategySpec,
    query: str,
    source_path: Path,
    k: int,
    coarse_k: int,
    relevant: set[str],
) -> StrategyRun:
    """Execute one (query, strategy) benchmark run and score its results.

    Temporarily mutates ``config.cascade_strategy`` and
    ``config.staged_stage2_mode`` for staged / dense_rerank runs; the prior
    values are restored in the ``finally`` block regardless of outcome.
    Engine exceptions are captured into ``StrategyRun.error`` instead of
    propagating, so a single failing run does not abort the benchmark.
    """
    # Reduce cross-run interference from lingering garbage before timing.
    gc.collect()
    effective_method = _select_effective_method(query, strategy_spec.strategy)
    # dense_rerank and staged both execute through the cascade path.
    execution_method = "cascade" if effective_method in {"dense_rerank", "staged"} else effective_method
    # Remember the config fields we may clobber so finally can restore them.
    previous_cascade_strategy = getattr(config, "cascade_strategy", None)
    previous_stage2_mode = getattr(config, "staged_stage2_mode", None)

    start_ms = _now_ms()
    try:
        options = _make_search_options(
            "staged" if strategy_spec.strategy == "staged" else effective_method,
            k=k,
        )
        if strategy_spec.strategy == "staged":
            config.cascade_strategy = "staged"
            if strategy_spec.stage2_mode:
                config.staged_stage2_mode = strategy_spec.stage2_mode
            result = engine.cascade_search(
                query=query,
                source_path=source_path,
                k=k,
                coarse_k=coarse_k,
                options=options,
                strategy="staged",
            )
        elif effective_method == "dense_rerank":
            config.cascade_strategy = "dense_rerank"
            result = engine.cascade_search(
                query=query,
                source_path=source_path,
                k=k,
                coarse_k=coarse_k,
                options=options,
                strategy="dense_rerank",
            )
        else:
            # fts / hybrid go through the plain search entry point.
            result = engine.search(
                query=query,
                source_path=source_path,
                options=options,
            )
        latency_ms = _now_ms() - start_ms
        paths_raw = [item.path for item in (result.results or []) if getattr(item, "path", None)]
        # Normalize and de-duplicate before scoring so metrics use canonical keys.
        topk = _dedup_topk((_normalize_path_key(path) for path in paths_raw), k=k)
        rank = _first_hit_rank(topk, relevant)
        recall = 0.0
        if relevant:
            recall = len(set(topk) & relevant) / float(len(relevant))
        return StrategyRun(
            strategy_key=strategy_spec.strategy_key,
            strategy=strategy_spec.strategy,
            stage2_mode=strategy_spec.stage2_mode,
            effective_method=effective_method,
            execution_method=execution_method,
            latency_ms=latency_ms,
            topk_paths=topk,
            first_hit_rank=rank,
            hit_at_k=rank is not None,
            recall_at_k=recall,
            generated_artifact_count=sum(1 for path in topk if is_generated_artifact_path(path)),
            test_file_count=sum(1 for path in topk if is_test_file(path)),
            error=None,
        )
    except Exception as exc:
        # Record the failure as data; the caller aggregates error counts.
        latency_ms = _now_ms() - start_ms
        return StrategyRun(
            strategy_key=strategy_spec.strategy_key,
            strategy=strategy_spec.strategy,
            stage2_mode=strategy_spec.stage2_mode,
            effective_method=effective_method,
            execution_method=execution_method,
            latency_ms=latency_ms,
            topk_paths=[],
            first_hit_rank=None,
            hit_at_k=False,
            recall_at_k=0.0,
            generated_artifact_count=0,
            test_file_count=0,
            error=f"{type(exc).__name__}: {exc}",
        )
    finally:
        # Always restore the config fields mutated above.
        config.cascade_strategy = previous_cascade_strategy
        config.staged_stage2_mode = previous_stage2_mode
|
||||
|
||||
|
||||
def _summarize_runs(runs: Sequence[StrategyRun]) -> Dict[str, Any]:
    """Aggregate per-run metrics for one strategy across all queries.

    Latency percentiles consider only error-free runs; hit/recall metrics
    include every run (a failed run counts as a miss).
    """
    ok_latencies = [entry.latency_ms for entry in runs if not entry.error]
    first_hit_ranks = [entry.first_hit_rank for entry in runs]
    method_counts: Dict[str, int] = {}
    for entry in runs:
        method_counts[entry.effective_method] = 1 + method_counts.get(entry.effective_method, 0)
    return {
        "query_count": len(runs),
        "hit_at_k": _mean([1.0 if entry.hit_at_k else 0.0 for entry in runs]),
        "mrr_at_k": _mrr(first_hit_ranks),
        "avg_recall_at_k": _mean([entry.recall_at_k for entry in runs]),
        "avg_latency_ms": _mean(ok_latencies),
        "p50_latency_ms": _percentile(ok_latencies, 0.50),
        "p95_latency_ms": _percentile(ok_latencies, 0.95),
        "avg_generated_artifact_count": _mean([float(entry.generated_artifact_count) for entry in runs]),
        "avg_test_file_count": _mean([float(entry.test_file_count) for entry in runs]),
        "runs_with_generated_artifacts": sum(1 for entry in runs if entry.generated_artifact_count > 0),
        "runs_with_test_files": sum(1 for entry in runs if entry.test_file_count > 0),
        "effective_methods": method_counts,
        "errors": sum(1 for entry in runs if entry.error),
    }
|
||||
|
||||
|
||||
def _build_pairwise_deltas(stage2_summaries: Dict[str, Dict[str, Any]]) -> List[PairwiseDelta]:
    """Compute metric deltas (mode_a minus mode_b) for every unordered mode pair.

    Pair order follows the insertion order of *stage2_summaries*; each pair
    appears exactly once.
    """
    modes = list(stage2_summaries)
    deltas: List[PairwiseDelta] = []
    for offset, mode_a in enumerate(modes):
        for mode_b in modes[offset + 1:]:
            summary_a = stage2_summaries[mode_a]
            summary_b = stage2_summaries[mode_b]
            deltas.append(
                PairwiseDelta(
                    mode_a=mode_a,
                    mode_b=mode_b,
                    hit_at_k_delta=summary_a["hit_at_k"] - summary_b["hit_at_k"],
                    mrr_at_k_delta=summary_a["mrr_at_k"] - summary_b["mrr_at_k"],
                    avg_recall_at_k_delta=summary_a["avg_recall_at_k"] - summary_b["avg_recall_at_k"],
                    avg_latency_ms_delta=summary_a["avg_latency_ms"] - summary_b["avg_latency_ms"],
                )
            )
    return deltas
|
||||
|
||||
|
||||
def _make_plan_payload(
    *,
    args: argparse.Namespace,
    source_root: Path,
    dataset: Sequence[Dict[str, Any]],
    baseline_methods: Sequence[str],
    stage2_modes: Sequence[str],
    strategy_specs: Sequence[StrategySpec],
) -> Dict[str, Any]:
    """Describe the benchmark plan without executing any retrieval.

    Shared by ``--dry-run`` and ``--self-check``; only the ``mode`` field
    differs. A three-entry dataset preview is included so a reviewer can
    sanity-check labels before a long run.
    """
    return {
        "mode": "dry-run" if args.dry_run else "self-check",
        "local_only": True,
        "source": str(source_root),
        "queries_file": str(args.queries_file),
        "query_count": len(dataset),
        "query_match": args.query_match,
        "k": int(args.k),
        "coarse_k": int(args.coarse_k),
        "baseline_methods": list(baseline_methods),
        "stage2_modes": list(stage2_modes),
        "strategy_keys": [spec.strategy_key for spec in strategy_specs],
        "local_backends": {
            "embedding_backend": args.embedding_backend,
            "embedding_model": args.embedding_model,
            "reranker_backend": args.reranker_backend,
            "reranker_model": args.reranker_model,
            "embedding_use_gpu": bool(args.embedding_use_gpu),
            "reranker_use_gpu": bool(args.reranker_use_gpu),
        },
        "output": str(args.output),
        "progress_output": str(args.progress_output) if args.progress_output else None,
        # Show at most the first three labeled queries.
        "dataset_preview": [
            {
                "query": item.get("query"),
                "intent": item.get("intent"),
                "relevant_paths": item.get("relevant_paths"),
            }
            for item in list(dataset)[: min(3, len(dataset))]
        ],
    }
|
||||
|
||||
|
||||
def build_parser() -> argparse.ArgumentParser:
    """Build the CLI argument parser for the benchmark runner.

    Argument groups: dataset selection, evaluation depth (k / coarse-k),
    strategy matrix, local backend configuration, no-retrieval modes, and
    output destinations.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    # --- Dataset selection ---
    parser.add_argument(
        "--source",
        type=Path,
        default=DEFAULT_SOURCE,
        help="Source root to benchmark. Defaults to the repository root so CCW and CodexLens paths resolve together.",
    )
    parser.add_argument(
        "--queries-file",
        type=Path,
        default=DEFAULT_QUERIES_FILE,
        help="Labeled JSONL dataset of CCW smart_search queries",
    )
    parser.add_argument("--query-limit", type=int, default=None, help="Optional query limit")
    parser.add_argument(
        "--query-match",
        type=str,
        default=None,
        help="Optional case-insensitive substring filter for selecting specific benchmark queries.",
    )
    # --- Evaluation depth ---
    parser.add_argument("--k", type=int, default=10, help="Top-k to evaluate")
    parser.add_argument("--coarse-k", type=int, default=100, help="Stage-1 coarse_k")
    # --- Strategy matrix ---
    parser.add_argument(
        "--baseline-methods",
        nargs="*",
        default=list(VALID_BASELINE_METHODS),
        help="Requested smart_search baselines to compare before staged modes (valid: auto, fts, hybrid).",
    )
    parser.add_argument(
        "--stage2-modes",
        nargs="*",
        default=list(VALID_STAGE2_MODES),
        help="Stage-2 modes to compare",
    )
    parser.add_argument("--warmup", type=int, default=0, help="Warmup iterations per strategy")
    # --- Local backend configuration ---
    parser.add_argument(
        "--embedding-backend",
        default="fastembed",
        help="Local embedding backend. This runner only accepts fastembed.",
    )
    parser.add_argument(
        "--embedding-model",
        default="code",
        help="Embedding model/profile for the local embedding backend",
    )
    parser.add_argument(
        "--embedding-use-gpu",
        action="store_true",
        help="Enable GPU acceleration for local embeddings. Off by default for stability.",
    )
    parser.add_argument(
        "--reranker-backend",
        default="onnx",
        help="Local reranker backend. Supported local values: onnx, fastembed, legacy.",
    )
    parser.add_argument(
        "--reranker-model",
        default=DEFAULT_LOCAL_ONNX_RERANKER_MODEL,
        help="Reranker model name for the local reranker backend",
    )
    parser.add_argument(
        "--reranker-use-gpu",
        action="store_true",
        help="Enable GPU acceleration for the local reranker. Off by default for stability.",
    )
    parser.add_argument(
        "--skip-dense-baseline",
        action="store_true",
        help="Only compare staged stage2 modes and skip the dense_rerank baseline.",
    )
    # --- Modes that skip retrieval ---
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Validate dataset/config and print the benchmark plan without running retrieval.",
    )
    parser.add_argument(
        "--self-check",
        action="store_true",
        help="Smoke-check the entrypoint by validating dataset, source paths, and stage matrix wiring.",
    )
    # --- Output destinations ---
    parser.add_argument(
        "--output",
        type=Path,
        default=DEFAULT_OUTPUT,
        help="Output JSON path",
    )
    parser.add_argument(
        "--progress-output",
        type=Path,
        default=None,
        help="Optional JSON path updated after each query with partial progress and completed runs.",
    )
    return parser
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point: validate inputs, run the strategy matrix, write results.

    Flow: argument validation -> dataset load/filter -> dataset path
    resolution check -> (optional) dry-run/self-check plan -> per-strategy
    runtime construction -> warmup -> per-query/per-strategy runs with
    optional progress snapshots -> summaries, pairwise deltas, and final
    payload written to disk and stdout. Engine/registry handles are closed
    in a ``finally`` block even if a run raises.
    """
    parser = build_parser()
    args = parser.parse_args()

    # --- Argument validation; SystemExit carries a user-facing message. ---
    source_root = args.source.expanduser().resolve()
    if not source_root.exists():
        raise SystemExit(f"Source path does not exist: {source_root}")
    if int(args.k) <= 0:
        raise SystemExit("--k must be > 0")
    if int(args.coarse_k) <= 0:
        raise SystemExit("--coarse-k must be > 0")
    if int(args.coarse_k) < int(args.k):
        raise SystemExit("--coarse-k must be >= --k")
    if int(args.warmup) < 0:
        raise SystemExit("--warmup must be >= 0")

    embedding_backend = str(args.embedding_backend).strip().lower()
    reranker_backend = str(args.reranker_backend).strip().lower()
    _validate_local_only_backends(embedding_backend, reranker_backend)
    baseline_methods = _validate_baseline_methods(args.baseline_methods)
    stage2_modes = _validate_stage2_modes(args.stage2_modes)

    # --- Dataset load + filters; limit is applied after the match filter. ---
    dataset = _load_labeled_queries(args.queries_file, None)
    dataset = _filter_dataset_by_query_match(dataset, args.query_match)
    dataset = _apply_query_limit(dataset, args.query_limit)
    if not dataset:
        raise SystemExit("No queries to run")

    # Fail fast if any labeled path does not resolve under the source root.
    missing_paths: List[str] = []
    for item in dataset:
        _, _, item_missing = _resolve_expected_paths(source_root, [str(path) for path in item["relevant_paths"]])
        missing_paths.extend(item_missing)
    if missing_paths:
        preview = ", ".join(missing_paths[:3])
        raise SystemExit(
            "Dataset relevant_paths do not resolve under the selected source root. "
            f"Examples: {preview}"
        )

    strategy_specs = _strategy_specs(
        stage2_modes,
        include_dense_baseline=not args.skip_dense_baseline,
        baseline_methods=baseline_methods,
    )

    # --- Plan-only modes exit before any retrieval work. ---
    if args.dry_run or args.self_check:
        payload = _make_plan_payload(
            args=args,
            source_root=source_root,
            dataset=dataset,
            baseline_methods=baseline_methods,
            stage2_modes=stage2_modes,
            strategy_specs=strategy_specs,
        )
        if args.self_check:
            payload["status"] = "ok"
            payload["checks"] = {
                "dataset_loaded": True,
                "stage2_matrix_size": len(stage2_modes),
                "local_only_validation": True,
                "source_path_exists": True,
            }
        print(json.dumps(payload, ensure_ascii=False, indent=2))
        return

    # --- Base config shared by all per-strategy runtimes. ---
    config = Config.load()
    config.cascade_strategy = "staged"
    config.enable_staged_rerank = True
    config.enable_cross_encoder_rerank = True
    config.embedding_backend = embedding_backend
    config.embedding_model = str(args.embedding_model).strip()
    config.embedding_use_gpu = bool(args.embedding_use_gpu)
    config.embedding_auto_embed_missing = False
    config.reranker_backend = reranker_backend
    config.reranker_model = str(args.reranker_model).strip()
    config.reranker_use_gpu = bool(args.reranker_use_gpu)

    strategy_runtimes = {
        spec.strategy_key: _build_strategy_runtime(config, spec)
        for spec in strategy_specs
    }

    evaluations: List[QueryEvaluation] = []
    total_queries = len(dataset)
    total_runs = total_queries * len(strategy_specs)
    completed_runs = 0

    try:
        # Optional warmup against the first query, with reduced k/coarse_k.
        if int(args.warmup) > 0:
            warm_query = str(dataset[0]["query"]).strip()
            warm_relevant_paths = [str(path) for path in dataset[0]["relevant_paths"]]
            _, warm_relevant, _ = _resolve_expected_paths(source_root, warm_relevant_paths)
            for spec in strategy_specs:
                runtime = strategy_runtimes[spec.strategy_key]
                for _ in range(int(args.warmup)):
                    _run_strategy(
                        runtime.engine,
                        runtime.config,
                        strategy_spec=spec,
                        query=warm_query,
                        source_path=source_root,
                        k=min(int(args.k), 5),
                        coarse_k=min(int(args.coarse_k), 50),
                        relevant=warm_relevant,
                    )

        for index, item in enumerate(dataset, start=1):
            query = str(item.get("query", "")).strip()
            if not query:
                # Skip empty-query rows without counting them as runs.
                continue
            print(f"[query {index}/{total_queries}] {query}", flush=True)
            relevant_paths, relevant, _ = _resolve_expected_paths(
                source_root,
                [str(path) for path in item["relevant_paths"]],
            )
            runs: Dict[str, StrategyRun] = {}
            for spec in strategy_specs:
                # Snapshot progress before each run so a crash leaves a trail.
                if args.progress_output is not None:
                    _write_json_payload(
                        args.progress_output,
                        _make_progress_payload(
                            args=args,
                            source_root=source_root,
                            strategy_specs=strategy_specs,
                            evaluations=evaluations,
                            query_index=index - 1,
                            total_queries=total_queries,
                            run_index=completed_runs,
                            total_runs=total_runs,
                            current_query=query,
                            current_strategy_key=spec.strategy_key,
                        ),
                    )
                print(
                    f"[run {completed_runs + 1}/{total_runs}] "
                    f"strategy={spec.strategy_key} query={query}",
                    flush=True,
                )
                runtime = strategy_runtimes[spec.strategy_key]
                runs[spec.strategy_key] = _run_strategy(
                    runtime.engine,
                    runtime.config,
                    strategy_spec=spec,
                    query=query,
                    source_path=source_root,
                    k=int(args.k),
                    coarse_k=int(args.coarse_k),
                    relevant=relevant,
                )
                completed_runs += 1
                run = runs[spec.strategy_key]
                outcome = "error" if run.error else "ok"
                print(
                    f"[done {completed_runs}/{total_runs}] "
                    f"strategy={spec.strategy_key} outcome={outcome} "
                    f"latency_ms={run.latency_ms:.2f} "
                    f"first_hit_rank={run.first_hit_rank}",
                    flush=True,
                )
            evaluations.append(
                QueryEvaluation(
                    query=query,
                    intent=str(item.get("intent")) if item.get("intent") is not None else None,
                    notes=str(item.get("notes")) if item.get("notes") is not None else None,
                    relevant_paths=relevant_paths,
                    runs=runs,
                )
            )
            # Snapshot again after the query completes all strategies.
            if args.progress_output is not None:
                _write_json_payload(
                    args.progress_output,
                    _make_progress_payload(
                        args=args,
                        source_root=source_root,
                        strategy_specs=strategy_specs,
                        evaluations=evaluations,
                        query_index=index,
                        total_queries=total_queries,
                        run_index=completed_runs,
                        total_runs=total_runs,
                        current_query=query,
                        current_strategy_key="complete",
                    ),
                )
    finally:
        # Close engines first, then registries; best-effort on both.
        for runtime in strategy_runtimes.values():
            try:
                runtime.engine.close()
            except Exception:
                pass
        for runtime in strategy_runtimes.values():
            try:
                runtime.registry.close()
            except Exception:
                pass

    # --- Aggregate per-strategy summaries over all completed evaluations. ---
    strategy_summaries: Dict[str, Dict[str, Any]] = {}
    for spec in strategy_specs:
        spec_runs = [evaluation.runs[spec.strategy_key] for evaluation in evaluations if spec.strategy_key in evaluation.runs]
        summary = _summarize_runs(spec_runs)
        summary["strategy"] = spec.strategy
        summary["stage2_mode"] = spec.stage2_mode
        strategy_summaries[spec.strategy_key] = summary

    stage2_mode_matrix = {
        mode: strategy_summaries[f"staged:{mode}"]
        for mode in stage2_modes
        if f"staged:{mode}" in strategy_summaries
    }
    pairwise_deltas = [asdict(item) for item in _build_pairwise_deltas(stage2_mode_matrix)]

    payload = {
        "status": "completed",
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
        "source": str(source_root),
        "queries_file": str(args.queries_file),
        "query_count": len(evaluations),
        "query_match": args.query_match,
        "k": int(args.k),
        "coarse_k": int(args.coarse_k),
        "local_only": True,
        "strategies": strategy_summaries,
        "stage2_mode_matrix": stage2_mode_matrix,
        "pairwise_stage2_deltas": pairwise_deltas,
        "config": {
            "embedding_backend": config.embedding_backend,
            "embedding_model": config.embedding_model,
            "embedding_use_gpu": bool(config.embedding_use_gpu),
            "reranker_backend": config.reranker_backend,
            "reranker_model": config.reranker_model,
            "reranker_use_gpu": bool(config.reranker_use_gpu),
            "enable_staged_rerank": bool(config.enable_staged_rerank),
            "enable_cross_encoder_rerank": bool(config.enable_cross_encoder_rerank),
        },
        "progress_output": str(args.progress_output) if args.progress_output else None,
        "evaluations": [
            {
                "query": evaluation.query,
                "intent": evaluation.intent,
                "notes": evaluation.notes,
                "relevant_paths": evaluation.relevant_paths,
                "runs": {key: asdict(run) for key, run in evaluation.runs.items()},
            }
            for evaluation in evaluations
        ],
    }

    _write_final_outputs(
        output_path=args.output,
        progress_output=args.progress_output,
        payload=payload,
    )
    print(json.dumps(payload, ensure_ascii=False, indent=2))
|
||||
|
||||
|
||||
# Script entry point: run the benchmark CLI when executed directly.
if __name__ == "__main__":
    main()
|
||||
@@ -1,405 +0,0 @@
|
||||
"""Compare Binary Cascade and Vector semantic search methods.
|
||||
|
||||
This script compares the two semantic retrieval approaches:
|
||||
1. Binary Cascade: 256-bit binary vectors for coarse ranking
|
||||
2. Vector Dense: Full semantic embeddings with cosine similarity
|
||||
"""
|
||||
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
# Add src to path
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
||||
|
||||
from codexlens.storage.dir_index import DirIndexStore
|
||||
from codexlens.semantic.vector_store import VectorStore
|
||||
|
||||
|
||||
def get_filename(path: str) -> str:
    """Return the last component of *path*.

    Windows separators are tried before POSIX ones, matching mixed-path
    inputs produced by the index stores on Windows.
    """
    for separator in ("\\", "/"):
        if separator in path:
            return path.rsplit(separator, 1)[-1]
    return path
|
||||
|
||||
|
||||
def find_binary_indexes(index_root: Path):
    """Recursively collect every binary-vector index file under *index_root*."""
    pattern = "_index_binary_vectors.bin"
    return [candidate for candidate in index_root.rglob(pattern)]
|
||||
|
||||
|
||||
# Test queries for semantic search comparison; chosen to exercise both
# natural-language and code-oriented phrasings.
TEST_QUERIES = [
    "how to search code semantically",
    "embedding generation for files",
    "hybrid search with multiple backends",
    "parse python source code",
    "database storage for vectors",
]

# Index paths
# NOTE(review): hard-coded to a single developer's machine — this script
# only works as-is on that host; parameterize before reuse.
INDEX_ROOT = Path(r"C:\Users\dyw\.codexlens\indexes\D\Claude_dms3\codex-lens")
|
||||
|
||||
|
||||
def test_vector_search(query: str, limit: int = 10):
    """Run a dense-vector similarity search for *query* across all indexes.

    Returns a ``(results, elapsed_ms, error)`` triple; on any exception the
    error string is returned instead of raising. Iterates every ``_index.db``
    under INDEX_ROOT but stops after the first index that yields results.
    """
    try:
        from codexlens.semantic.factory import get_embedder

        # Find an index with embeddings
        all_results = []
        total_time = 0

        for index_db in INDEX_ROOT.rglob("_index.db"):
            vector_store = VectorStore(index_db)

            # Skip indexes without any embedded chunks.
            if vector_store.count_chunks() == 0:
                continue

            # Pick the embedder matching the model config stored with the
            # index, falling back to the default fastembed "code" profile.
            model_config = vector_store.get_model_config()
            if model_config:
                backend = model_config.get("backend", "fastembed")
                model_name = model_config["model_name"]
                model_profile = model_config["model_profile"]
                if backend == "litellm":
                    embedder = get_embedder(backend="litellm", model=model_name)
                else:
                    embedder = get_embedder(backend="fastembed", profile=model_profile)
            else:
                embedder = get_embedder(backend="fastembed", profile="code")

            # Time both query embedding and the similarity search, in ms.
            start = time.perf_counter()
            query_embedding = embedder.embed_single(query)
            results = vector_store.search_similar(
                query_embedding=query_embedding,
                top_k=limit,
                min_score=0.0,
                return_full_content=True,
            )
            total_time += (time.perf_counter() - start) * 1000
            all_results.extend(results)

            # Only need one successful search to get embedder initialized
            if results:
                break

        # Sort by score and limit
        all_results.sort(key=lambda x: x.score, reverse=True)
        return all_results[:limit], total_time, None
    except Exception as e:
        return [], 0, str(e)
|
||||
|
||||
|
||||
|
||||
def test_binary_cascade_search(query: str, limit: int = 10):
    """Run the two-stage binary-cascade search for *query*.

    Stage 1 ranks coarsely with 256-bit binary vectors via BinaryANNIndex;
    stage 2 reranks the coarse candidates by cosine similarity on their
    stored dense embeddings. Returns ``(results, elapsed_ms, error)``;
    failures are reported through the error string rather than raised.
    """
    try:
        from codexlens.semantic.ann_index import BinaryANNIndex
        from codexlens.indexing.embedding import CascadeEmbeddingBackend
        import numpy as np
        import sqlite3

        # Find binary indexes
        binary_indexes = find_binary_indexes(INDEX_ROOT)
        if not binary_indexes:
            return [], 0, "No binary indexes found. Run 'codexlens cascade-index' first."

        start = time.perf_counter()

        # Initialize cascade backend for query encoding
        cascade_backend = CascadeEmbeddingBackend()

        # Encode the query once into both representations: binary for the
        # coarse stage, dense for the rerank stage.
        binary_embeddings, dense_embeddings = cascade_backend.encode_cascade([query], batch_size=1)
        query_binary = binary_embeddings[0]
        query_dense = dense_embeddings[0]

        all_results = []

        for binary_index_path in binary_indexes:
            # Find corresponding index.db
            index_db = binary_index_path.parent / "_index.db"
            if not index_db.exists():
                continue

            # Check if cascade embeddings exist before paying the load cost.
            conn = sqlite3.connect(index_db)
            conn.row_factory = sqlite3.Row
            try:
                cursor = conn.execute(
                    "SELECT COUNT(*) FROM semantic_chunks WHERE embedding_binary IS NOT NULL"
                )
                binary_count = cursor.fetchone()[0]
                if binary_count == 0:
                    conn.close()
                    continue
            except Exception:
                # Schema mismatch or missing table: skip this index entirely.
                conn.close()
                continue

            # Stage 1: Binary coarse search
            binary_index = BinaryANNIndex(index_db, dim=256)
            try:
                binary_index.load()
            except Exception:
                conn.close()
                continue

            # Pack query for binary search
            from codexlens.indexing.embedding import pack_binary_embedding
            query_binary_packed = pack_binary_embedding(query_binary)

            # Over-fetch candidates (10x, capped at 100) for the rerank stage.
            coarse_limit = min(limit * 10, 100)
            # search returns (ids, distances) tuple
            coarse_ids, coarse_distances = binary_index.search(query_binary_packed, top_k=coarse_limit)

            if not coarse_ids:
                conn.close()
                continue

            # Stage 2: Dense reranking
            chunk_ids = coarse_ids
            placeholders = ",".join("?" * len(chunk_ids))

            cursor = conn.execute(
                f"""
                SELECT id, file_path, content, embedding_dense
                FROM semantic_chunks
                WHERE id IN ({placeholders}) AND embedding_dense IS NOT NULL
                """,
                chunk_ids
            )
            rows = cursor.fetchall()

            # Compute dense cosine-similarity scores for each candidate.
            for row in rows:
                chunk_id = row["id"]
                file_path = row["file_path"]
                content = row["content"]
                dense_blob = row["embedding_dense"]

                if dense_blob:
                    dense_vec = np.frombuffer(dense_blob, dtype=np.float32)
                    # Cosine similarity with an epsilon to avoid divide-by-zero.
                    score = float(np.dot(query_dense, dense_vec) / (
                        np.linalg.norm(query_dense) * np.linalg.norm(dense_vec) + 1e-8
                    ))
                else:
                    score = 0.0

                all_results.append({
                    "path": file_path,
                    "score": score,
                    "content": content[:200] + "..." if len(content) > 200 else content,
                })

            conn.close()

        # Sort by dense score and limit
        all_results.sort(key=lambda x: x["score"], reverse=True)
        final_results = all_results[:limit]

        elapsed = (time.perf_counter() - start) * 1000

        return final_results, elapsed, None
    except ImportError as e:
        return [], 0, f"Import error: {e}"
    except Exception as e:
        import traceback
        return [], 0, f"{str(e)}\n{traceback.format_exc()}"
|
||||
|
||||
|
||||
def print_results(method_name: str, results, elapsed: float, error=None):
    """Print up to five search results in a formatted way.

    Args:
        method_name: Human-readable label for the search method.
        results: Sequence of result rows; each row is either a dict with
            path/score/content keys or an object with those attributes.
        elapsed: Wall-clock search time in milliseconds.
        error: Optional error text; when set it is printed instead of results.
    """
    print(f"\n{'='*60}")
    print(f"Method: {method_name}")
    print(f"{'='*60}")

    if error:
        print(f"ERROR: {error}")
        return

    print(f"Results: {len(results)}, Time: {elapsed:.1f}ms")
    print("-" * 60)

    for i, r in enumerate(results[:5], 1):
        # Results may be dicts (binary cascade path) or objects (vector path).
        if isinstance(r, dict):
            path = r.get("path", "?")
            score = r.get("score", 0)
            content = r.get("content", "")[:80]
        else:
            path = getattr(r, "path", "?")
            score = getattr(r, "score", 0)
            content = getattr(r, "content", "")[:80] if hasattr(r, "content") else ""

        filename = get_filename(path)
        # BUG FIX: the filename was computed but the literal "(unknown)" was
        # printed; show the actual file name for each hit instead.
        print(f"    {i}. [{score:.4f}] {filename}")
        if content:
            # Sanitize content for console output (console may not be UTF-8).
            safe_content = content.encode('ascii', 'replace').decode('ascii')
            print(f"       {safe_content}...")
|
||||
|
||||
|
||||
def compare_overlap(results1, results2, name1: str, name2: str):
    """Report and return the Jaccard overlap of two top-10 result sets.

    Returns 0.0 (without printing) when either side contributes no paths.
    """

    def extract_paths(entries):
        # Accept both dict rows and attribute-style result objects.
        collected = set()
        for entry in entries[:10]:
            if isinstance(entry, dict):
                collected.add(entry.get("path", ""))
            else:
                collected.add(getattr(entry, "path", ""))
        return collected

    left = extract_paths(results1)
    right = extract_paths(results2)

    if not left or not right:
        return 0.0

    shared = len(left & right)
    combined = len(left | right)
    jaccard = shared / combined if combined > 0 else 0.0

    print(f"  {name1} vs {name2}: {shared} common files (Jaccard: {jaccard:.2f})")
    return jaccard
|
||||
|
||||
|
||||
def main():
|
||||
print("=" * 70)
|
||||
print("SEMANTIC SEARCH METHODS COMPARISON")
|
||||
print("Binary Cascade vs Vector Dense")
|
||||
print("=" * 70)
|
||||
|
||||
# Check prerequisites
|
||||
print("\n[Prerequisites Check]")
|
||||
print(f" Index Root: {INDEX_ROOT}")
|
||||
|
||||
binary_indexes = find_binary_indexes(INDEX_ROOT)
|
||||
print(f" Binary Indexes: {len(binary_indexes)} found")
|
||||
for bi in binary_indexes[:3]:
|
||||
print(f" - {bi.parent.name}/{bi.name}")
|
||||
if len(binary_indexes) > 3:
|
||||
print(f" ... and {len(binary_indexes) - 3} more")
|
||||
|
||||
# Aggregate statistics
|
||||
all_results = {
|
||||
"binary": {"total_results": 0, "total_time": 0, "queries": 0, "errors": []},
|
||||
"vector": {"total_results": 0, "total_time": 0, "queries": 0, "errors": []},
|
||||
}
|
||||
|
||||
overlap_scores = {"binary_vector": []}
|
||||
|
||||
for query in TEST_QUERIES:
|
||||
print(f"\n{'#'*70}")
|
||||
print(f"QUERY: \"{query}\"")
|
||||
print("#" * 70)
|
||||
|
||||
# Test each method
|
||||
binary_results, binary_time, binary_err = test_binary_cascade_search(query)
|
||||
vector_results, vector_time, vector_err = test_vector_search(query)
|
||||
|
||||
# Print results
|
||||
print_results("Binary Cascade (256-bit + Dense Rerank)", binary_results, binary_time, binary_err)
|
||||
print_results("Vector Dense (Semantic Embeddings)", vector_results, vector_time, vector_err)
|
||||
|
||||
# Update statistics
|
||||
if not binary_err:
|
||||
all_results["binary"]["total_results"] += len(binary_results)
|
||||
all_results["binary"]["total_time"] += binary_time
|
||||
all_results["binary"]["queries"] += 1
|
||||
else:
|
||||
all_results["binary"]["errors"].append(binary_err)
|
||||
|
||||
if not vector_err:
|
||||
all_results["vector"]["total_results"] += len(vector_results)
|
||||
all_results["vector"]["total_time"] += vector_time
|
||||
all_results["vector"]["queries"] += 1
|
||||
else:
|
||||
all_results["vector"]["errors"].append(vector_err)
|
||||
|
||||
# Compare overlap
|
||||
print("\n[Result Overlap Analysis]")
|
||||
if binary_results and vector_results:
|
||||
j = compare_overlap(binary_results, vector_results, "Binary", "Vector")
|
||||
overlap_scores["binary_vector"].append(j)
|
||||
|
||||
# Print summary
|
||||
print("\n" + "=" * 70)
|
||||
print("SUMMARY STATISTICS")
|
||||
print("=" * 70)
|
||||
|
||||
for method, stats in all_results.items():
|
||||
queries = stats["queries"]
|
||||
if queries > 0:
|
||||
avg_results = stats["total_results"] / queries
|
||||
avg_time = stats["total_time"] / queries
|
||||
print(f"\n{method.upper()}:")
|
||||
print(f" Successful queries: {queries}/{len(TEST_QUERIES)}")
|
||||
print(f" Avg results: {avg_results:.1f}")
|
||||
print(f" Avg time: {avg_time:.1f}ms")
|
||||
else:
|
||||
print(f"\n{method.upper()}: No successful queries")
|
||||
if stats["errors"]:
|
||||
# Show truncated error
|
||||
err = stats["errors"][0]
|
||||
if len(err) > 200:
|
||||
err = err[:200] + "..."
|
||||
print(f" Error: {err}")
|
||||
|
||||
print("\n[Average Overlap Scores]")
|
||||
for pair, scores in overlap_scores.items():
|
||||
if scores:
|
||||
avg = sum(scores) / len(scores)
|
||||
print(f" {pair}: {avg:.3f}")
|
||||
|
||||
print("\n" + "=" * 70)
|
||||
print("ANALYSIS")
|
||||
print("=" * 70)
|
||||
|
||||
# Analyze working methods
|
||||
working_methods = [m for m, s in all_results.items() if s["queries"] > 0]
|
||||
|
||||
if len(working_methods) == 2:
|
||||
# All methods working - compare quality
|
||||
print("\nBoth methods working. Quality comparison:")
|
||||
|
||||
# Compare avg results
|
||||
print("\n Result Coverage (higher = more recall):")
|
||||
for m in ["vector", "binary"]:
|
||||
stats = all_results[m]
|
||||
if stats["queries"] > 0:
|
||||
avg = stats["total_results"] / stats["queries"]
|
||||
print(f" {m.upper()}: {avg:.1f} results/query")
|
||||
|
||||
# Compare speed
|
||||
print("\n Speed (lower = faster):")
|
||||
for m in ["binary", "vector"]:
|
||||
stats = all_results[m]
|
||||
if stats["queries"] > 0:
|
||||
avg = stats["total_time"] / stats["queries"]
|
||||
print(f" {m.upper()}: {avg:.1f}ms")
|
||||
|
||||
# Recommend fusion strategy
|
||||
print("\n Recommended Fusion Strategy:")
|
||||
print(" For quality-focused hybrid search:")
|
||||
print(" 1. Run both methods in parallel")
|
||||
print(" 2. Use RRF fusion with weights:")
|
||||
print(" - Vector: 0.6 (best semantic understanding)")
|
||||
print(" - Binary: 0.4 (fast coarse filtering)")
|
||||
print(" 3. Apply CrossEncoder reranking on top-50")
|
||||
|
||||
elif len(working_methods) >= 2:
|
||||
print(f"\n{len(working_methods)} methods working: {', '.join(working_methods)}")
|
||||
print("Consider fixing missing method for complete hybrid search.")
|
||||
else:
|
||||
print(f"\nOnly {working_methods[0] if working_methods else 'no'} method(s) working.")
|
||||
print("Check your index setup.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,393 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
"""Compare staged realtime LSP pipeline vs direct dense->rerank cascade.
|
||||
|
||||
This benchmark compares two retrieval pipelines:
|
||||
1) staged+realtime: coarse (binary or dense fallback) -> realtime LSP graph expand -> clustering -> rerank
|
||||
2) dense_rerank: dense ANN coarse -> cross-encoder rerank
|
||||
|
||||
Because most repos do not have ground-truth labels, this script reports:
|
||||
- latency statistics
|
||||
- top-k overlap metrics (Jaccard + RBO)
|
||||
- diversity proxies (unique files/dirs)
|
||||
- staged pipeline stage stats (if present)
|
||||
|
||||
Usage:
|
||||
python benchmarks/compare_staged_realtime_vs_dense_rerank.py --source ./src
|
||||
python benchmarks/compare_staged_realtime_vs_dense_rerank.py --queries-file benchmarks/queries.txt
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import gc
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import statistics
|
||||
import sys
|
||||
import time
|
||||
from dataclasses import asdict, dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Iterable, List, Optional, Tuple
|
||||
|
||||
# Add src to path (match other benchmark scripts)
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
||||
|
||||
from codexlens.config import Config
|
||||
from codexlens.search.chain_search import ChainSearchEngine, SearchOptions
|
||||
from codexlens.storage.path_mapper import PathMapper
|
||||
from codexlens.storage.registry import RegistryStore
|
||||
|
||||
|
||||
DEFAULT_QUERIES = [
|
||||
"class Config",
|
||||
"def search",
|
||||
"LspBridge",
|
||||
"graph expansion",
|
||||
"clustering strategy",
|
||||
"error handling",
|
||||
"how to parse json",
|
||||
]
|
||||
|
||||
|
||||
def _now_ms() -> float:
|
||||
return time.perf_counter() * 1000.0
|
||||
|
||||
|
||||
def _safe_relpath(path: str, root: Path) -> str:
|
||||
try:
|
||||
return str(Path(path).resolve().relative_to(root.resolve()))
|
||||
except Exception:
|
||||
return path
|
||||
|
||||
|
||||
def _normalize_path_key(path: str) -> str:
|
||||
"""Normalize file paths for overlap/dedup metrics (Windows-safe)."""
|
||||
try:
|
||||
p = Path(path)
|
||||
# Don't explode on non-files like "<memory>".
|
||||
if str(p) and (p.is_absolute() or re.match(r"^[A-Za-z]:", str(p))):
|
||||
norm = str(p.resolve())
|
||||
else:
|
||||
norm = str(p)
|
||||
except Exception:
|
||||
norm = path
|
||||
norm = norm.replace("/", "\\")
|
||||
if os.name == "nt":
|
||||
norm = norm.lower()
|
||||
return norm
|
||||
|
||||
|
||||
def _extract_stage_stats(errors: List[str]) -> Optional[Dict[str, Any]]:
|
||||
"""Extract STAGE_STATS JSON blob from SearchStats.errors."""
|
||||
for item in errors or []:
|
||||
if not isinstance(item, str):
|
||||
continue
|
||||
if not item.startswith("STAGE_STATS:"):
|
||||
continue
|
||||
payload = item[len("STAGE_STATS:") :]
|
||||
try:
|
||||
return json.loads(payload)
|
||||
except Exception:
|
||||
return None
|
||||
return None
|
||||
|
||||
|
||||
def jaccard_topk(a: List[str], b: List[str]) -> float:
|
||||
sa, sb = set(a), set(b)
|
||||
if not sa and not sb:
|
||||
return 1.0
|
||||
if not sa or not sb:
|
||||
return 0.0
|
||||
return len(sa & sb) / len(sa | sb)
|
||||
|
||||
|
||||
def rbo(a: List[str], b: List[str], p: float = 0.9) -> float:
|
||||
"""Rank-biased overlap for two ranked lists."""
|
||||
if p <= 0.0 or p >= 1.0:
|
||||
raise ValueError("p must be in (0, 1)")
|
||||
if not a and not b:
|
||||
return 1.0
|
||||
|
||||
depth = max(len(a), len(b))
|
||||
seen_a: set[str] = set()
|
||||
seen_b: set[str] = set()
|
||||
|
||||
score = 0.0
|
||||
for d in range(1, depth + 1):
|
||||
if d <= len(a):
|
||||
seen_a.add(a[d - 1])
|
||||
if d <= len(b):
|
||||
seen_b.add(b[d - 1])
|
||||
overlap = len(seen_a & seen_b)
|
||||
score += (overlap / d) * ((1.0 - p) * (p ** (d - 1)))
|
||||
return score
|
||||
|
||||
|
||||
def _unique_parent_dirs(paths: Iterable[str]) -> int:
|
||||
dirs = set()
|
||||
for p in paths:
|
||||
try:
|
||||
dirs.add(str(Path(p).parent))
|
||||
except Exception:
|
||||
continue
|
||||
return len(dirs)
|
||||
|
||||
|
||||
@dataclass
|
||||
class RunDetail:
|
||||
strategy: str
|
||||
query: str
|
||||
latency_ms: float
|
||||
num_results: int
|
||||
topk_paths: List[str]
|
||||
stage_stats: Optional[Dict[str, Any]] = None
|
||||
error: Optional[str] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class CompareDetail:
|
||||
query: str
|
||||
staged: RunDetail
|
||||
dense_rerank: RunDetail
|
||||
jaccard_topk: float
|
||||
rbo_topk: float
|
||||
staged_unique_files_topk: int
|
||||
dense_unique_files_topk: int
|
||||
staged_unique_dirs_topk: int
|
||||
dense_unique_dirs_topk: int
|
||||
|
||||
|
||||
def _run_once(
|
||||
engine: ChainSearchEngine,
|
||||
query: str,
|
||||
source_path: Path,
|
||||
*,
|
||||
strategy: str,
|
||||
k: int,
|
||||
coarse_k: int,
|
||||
options: Optional[SearchOptions] = None,
|
||||
) -> RunDetail:
|
||||
gc.collect()
|
||||
start_ms = _now_ms()
|
||||
try:
|
||||
result = engine.cascade_search(
|
||||
query=query,
|
||||
source_path=source_path,
|
||||
k=k,
|
||||
coarse_k=coarse_k,
|
||||
options=options,
|
||||
strategy=strategy,
|
||||
)
|
||||
latency_ms = _now_ms() - start_ms
|
||||
paths_raw = [r.path for r in (result.results or []) if getattr(r, "path", None)]
|
||||
paths = [_normalize_path_key(p) for p in paths_raw]
|
||||
topk: List[str] = []
|
||||
seen: set[str] = set()
|
||||
for p in paths:
|
||||
if p in seen:
|
||||
continue
|
||||
seen.add(p)
|
||||
topk.append(p)
|
||||
if len(topk) >= k:
|
||||
break
|
||||
stage_stats = _extract_stage_stats(getattr(result.stats, "errors", []))
|
||||
return RunDetail(
|
||||
strategy=strategy,
|
||||
query=query,
|
||||
latency_ms=latency_ms,
|
||||
num_results=len(paths),
|
||||
topk_paths=topk,
|
||||
stage_stats=stage_stats,
|
||||
)
|
||||
except Exception as exc:
|
||||
latency_ms = _now_ms() - start_ms
|
||||
return RunDetail(
|
||||
strategy=strategy,
|
||||
query=query,
|
||||
latency_ms=latency_ms,
|
||||
num_results=0,
|
||||
topk_paths=[],
|
||||
stage_stats=None,
|
||||
error=repr(exc),
|
||||
)
|
||||
|
||||
|
||||
def _load_queries(path: Optional[Path], limit: Optional[int]) -> List[str]:
|
||||
if path is None:
|
||||
queries = list(DEFAULT_QUERIES)
|
||||
else:
|
||||
raw = path.read_text(encoding="utf-8", errors="ignore").splitlines()
|
||||
queries = []
|
||||
for line in raw:
|
||||
line = line.strip()
|
||||
if not line or line.startswith("#"):
|
||||
continue
|
||||
queries.append(line)
|
||||
if limit is not None:
|
||||
return queries[:limit]
|
||||
return queries
|
||||
|
||||
|
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Compare staged realtime LSP pipeline vs direct dense_rerank cascade"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--source",
|
||||
type=Path,
|
||||
default=Path(__file__).parent.parent / "src",
|
||||
help="Source directory to search (default: ./src)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--queries-file",
|
||||
type=Path,
|
||||
default=None,
|
||||
help="Optional file with one query per line (# comments supported)",
|
||||
)
|
||||
parser.add_argument("--queries", type=int, default=None, help="Limit number of queries")
|
||||
parser.add_argument("--k", type=int, default=10, help="Final result count (default 10)")
|
||||
parser.add_argument("--coarse-k", type=int, default=100, help="Coarse candidates (default 100)")
|
||||
parser.add_argument("--warmup", type=int, default=1, help="Warmup runs per strategy (default 1)")
|
||||
parser.add_argument(
|
||||
"--staged-cluster-strategy",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Override Config.staged_clustering_strategy for staged pipeline (e.g. auto, dir_rr, score, path)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output",
|
||||
type=Path,
|
||||
default=Path(__file__).parent / "results" / "staged_realtime_vs_dense_rerank.json",
|
||||
help="Output JSON path",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
if not args.source.exists():
|
||||
raise SystemExit(f"Source path does not exist: {args.source}")
|
||||
|
||||
queries = _load_queries(args.queries_file, args.queries)
|
||||
if not queries:
|
||||
raise SystemExit("No queries to run")
|
||||
|
||||
# Match CLI behavior: load settings + apply global/workspace .env overrides.
|
||||
# This is important on Windows where ONNX/DirectML can sometimes crash under load;
|
||||
# many users pin EMBEDDING_BACKEND=litellm in ~/.codexlens/.env for stability.
|
||||
config = Config.load()
|
||||
config.cascade_strategy = "staged"
|
||||
config.staged_stage2_mode = "realtime"
|
||||
config.enable_staged_rerank = True
|
||||
if args.staged_cluster_strategy:
|
||||
config.staged_clustering_strategy = str(args.staged_cluster_strategy)
|
||||
# Stability: on some Windows setups, fastembed + DirectML can crash under load.
|
||||
# Force local embeddings and reranking onto CPU for reproducible benchmark runs.
|
||||
config.embedding_use_gpu = False
|
||||
config.reranker_use_gpu = False
|
||||
registry = RegistryStore()
|
||||
registry.initialize()
|
||||
mapper = PathMapper()
|
||||
engine = ChainSearchEngine(registry=registry, mapper=mapper, config=config)
|
||||
|
||||
try:
|
||||
strategies = ["staged", "dense_rerank"]
|
||||
|
||||
# Warmup
|
||||
if args.warmup > 0:
|
||||
warm_query = queries[0]
|
||||
for s in strategies:
|
||||
for _ in range(args.warmup):
|
||||
try:
|
||||
_run_once(
|
||||
engine,
|
||||
warm_query,
|
||||
args.source,
|
||||
strategy=s,
|
||||
k=min(args.k, 5),
|
||||
coarse_k=min(args.coarse_k, 50),
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
comparisons: List[CompareDetail] = []
|
||||
|
||||
for i, query in enumerate(queries, start=1):
|
||||
print(f"[{i}/{len(queries)}] {query}")
|
||||
|
||||
staged = _run_once(
|
||||
engine,
|
||||
query,
|
||||
args.source,
|
||||
strategy="staged",
|
||||
k=args.k,
|
||||
coarse_k=args.coarse_k,
|
||||
)
|
||||
dense = _run_once(
|
||||
engine,
|
||||
query,
|
||||
args.source,
|
||||
strategy="dense_rerank",
|
||||
k=args.k,
|
||||
coarse_k=args.coarse_k,
|
||||
)
|
||||
|
||||
staged_paths = staged.topk_paths
|
||||
dense_paths = dense.topk_paths
|
||||
|
||||
comparisons.append(
|
||||
CompareDetail(
|
||||
query=query,
|
||||
staged=staged,
|
||||
dense_rerank=dense,
|
||||
jaccard_topk=jaccard_topk(staged_paths, dense_paths),
|
||||
rbo_topk=rbo(staged_paths, dense_paths, p=0.9),
|
||||
staged_unique_files_topk=len(set(staged_paths)),
|
||||
dense_unique_files_topk=len(set(dense_paths)),
|
||||
staged_unique_dirs_topk=_unique_parent_dirs(staged_paths),
|
||||
dense_unique_dirs_topk=_unique_parent_dirs(dense_paths),
|
||||
)
|
||||
)
|
||||
|
||||
def _latencies(details: List[RunDetail]) -> List[float]:
|
||||
return [d.latency_ms for d in details if not d.error]
|
||||
|
||||
staged_runs = [c.staged for c in comparisons]
|
||||
dense_runs = [c.dense_rerank for c in comparisons]
|
||||
|
||||
summary = {
|
||||
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"source": str(args.source),
|
||||
"k": args.k,
|
||||
"coarse_k": args.coarse_k,
|
||||
"query_count": len(comparisons),
|
||||
"avg_jaccard_topk": statistics.mean([c.jaccard_topk for c in comparisons]) if comparisons else 0.0,
|
||||
"avg_rbo_topk": statistics.mean([c.rbo_topk for c in comparisons]) if comparisons else 0.0,
|
||||
"staged": {
|
||||
"success": sum(1 for r in staged_runs if not r.error),
|
||||
"avg_latency_ms": statistics.mean(_latencies(staged_runs)) if _latencies(staged_runs) else 0.0,
|
||||
},
|
||||
"dense_rerank": {
|
||||
"success": sum(1 for r in dense_runs if not r.error),
|
||||
"avg_latency_ms": statistics.mean(_latencies(dense_runs)) if _latencies(dense_runs) else 0.0,
|
||||
},
|
||||
}
|
||||
|
||||
args.output.parent.mkdir(parents=True, exist_ok=True)
|
||||
payload = {
|
||||
"summary": summary,
|
||||
"comparisons": [asdict(c) for c in comparisons],
|
||||
}
|
||||
args.output.write_text(json.dumps(payload, indent=2), encoding="utf-8")
|
||||
print(f"\nSaved: {args.output}")
|
||||
finally:
|
||||
try:
|
||||
engine.close()
|
||||
except Exception as exc:
|
||||
print(f"WARNING engine.close() failed: {exc!r}", file=sys.stderr)
|
||||
try:
|
||||
registry.close()
|
||||
except Exception as exc:
|
||||
print(f"WARNING registry.close() failed: {exc!r}", file=sys.stderr)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,391 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
"""Compare staged cascade Stage-2 modes (precomputed vs realtime vs static graph).
|
||||
|
||||
This benchmark compares the *same* staged cascade strategy with different Stage-2
|
||||
expansion sources:
|
||||
|
||||
1) precomputed: per-dir `graph_neighbors` expansion (fast, index-local)
|
||||
2) realtime: live LSP graph expansion (contextual, requires LSP availability)
|
||||
3) static_global_graph: global_relationships expansion (project-wide, requires static graph indexing)
|
||||
|
||||
Because most repos do not have ground-truth labels, this script reports:
|
||||
- latency statistics per mode
|
||||
- top-k overlap metrics (Jaccard + RBO) between modes
|
||||
- diversity proxies (unique files/dirs)
|
||||
- staged pipeline stage stats (when present)
|
||||
|
||||
Usage:
|
||||
python benchmarks/compare_staged_stage2_modes.py --source ./src
|
||||
python benchmarks/compare_staged_stage2_modes.py --queries-file benchmarks/queries.txt
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import gc
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import statistics
|
||||
import sys
|
||||
import time
|
||||
from dataclasses import asdict, dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Iterable, List, Optional, Tuple
|
||||
|
||||
# Add src to path (match other benchmark scripts)
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
||||
|
||||
from codexlens.config import Config
|
||||
from codexlens.search.chain_search import ChainSearchEngine
|
||||
from codexlens.storage.path_mapper import PathMapper
|
||||
from codexlens.storage.registry import RegistryStore
|
||||
|
||||
|
||||
DEFAULT_QUERIES = [
|
||||
"class Config",
|
||||
"def search",
|
||||
"LspBridge",
|
||||
"graph expansion",
|
||||
"static graph relationships",
|
||||
"clustering strategy",
|
||||
"error handling",
|
||||
]
|
||||
|
||||
|
||||
VALID_STAGE2_MODES = ("precomputed", "realtime", "static_global_graph")
|
||||
|
||||
|
||||
def _now_ms() -> float:
|
||||
return time.perf_counter() * 1000.0
|
||||
|
||||
|
||||
def _normalize_path_key(path: str) -> str:
|
||||
"""Normalize file paths for overlap/dedup metrics (Windows-safe)."""
|
||||
try:
|
||||
p = Path(path)
|
||||
if str(p) and (p.is_absolute() or re.match(r"^[A-Za-z]:", str(p))):
|
||||
norm = str(p.resolve())
|
||||
else:
|
||||
norm = str(p)
|
||||
except Exception:
|
||||
norm = path
|
||||
norm = norm.replace("/", "\\")
|
||||
if os.name == "nt":
|
||||
norm = norm.lower()
|
||||
return norm
|
||||
|
||||
|
||||
def _extract_stage_stats(errors: List[str]) -> Optional[Dict[str, Any]]:
|
||||
"""Extract STAGE_STATS JSON blob from SearchStats.errors."""
|
||||
for item in errors or []:
|
||||
if not isinstance(item, str):
|
||||
continue
|
||||
if not item.startswith("STAGE_STATS:"):
|
||||
continue
|
||||
payload = item[len("STAGE_STATS:") :]
|
||||
try:
|
||||
return json.loads(payload)
|
||||
except Exception:
|
||||
return None
|
||||
return None
|
||||
|
||||
|
||||
def jaccard_topk(a: List[str], b: List[str]) -> float:
|
||||
sa, sb = set(a), set(b)
|
||||
if not sa and not sb:
|
||||
return 1.0
|
||||
if not sa or not sb:
|
||||
return 0.0
|
||||
return len(sa & sb) / len(sa | sb)
|
||||
|
||||
|
||||
def rbo(a: List[str], b: List[str], p: float = 0.9) -> float:
|
||||
"""Rank-biased overlap for two ranked lists."""
|
||||
if p <= 0.0 or p >= 1.0:
|
||||
raise ValueError("p must be in (0, 1)")
|
||||
if not a and not b:
|
||||
return 1.0
|
||||
|
||||
depth = max(len(a), len(b))
|
||||
seen_a: set[str] = set()
|
||||
seen_b: set[str] = set()
|
||||
|
||||
score = 0.0
|
||||
for d in range(1, depth + 1):
|
||||
if d <= len(a):
|
||||
seen_a.add(a[d - 1])
|
||||
if d <= len(b):
|
||||
seen_b.add(b[d - 1])
|
||||
overlap = len(seen_a & seen_b)
|
||||
score += (overlap / d) * ((1.0 - p) * (p ** (d - 1)))
|
||||
return score
|
||||
|
||||
|
||||
def _unique_parent_dirs(paths: Iterable[str]) -> int:
|
||||
dirs = set()
|
||||
for p in paths:
|
||||
try:
|
||||
dirs.add(str(Path(p).parent))
|
||||
except Exception:
|
||||
continue
|
||||
return len(dirs)
|
||||
|
||||
|
||||
def _load_queries(path: Optional[Path], inline: Optional[List[str]]) -> List[str]:
|
||||
if inline:
|
||||
return [q.strip() for q in inline if isinstance(q, str) and q.strip()]
|
||||
if path:
|
||||
if not path.exists():
|
||||
raise SystemExit(f"Queries file does not exist: {path}")
|
||||
raw = path.read_text(encoding="utf-8", errors="ignore")
|
||||
queries = [line.strip() for line in raw.splitlines() if line.strip() and not line.strip().startswith("#")]
|
||||
return queries
|
||||
return list(DEFAULT_QUERIES)
|
||||
|
||||
|
||||
@dataclass
|
||||
class RunDetail:
|
||||
stage2_mode: str
|
||||
query: str
|
||||
latency_ms: float
|
||||
num_results: int
|
||||
topk_paths: List[str]
|
||||
stage_stats: Optional[Dict[str, Any]] = None
|
||||
error: Optional[str] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class PairwiseCompare:
|
||||
query: str
|
||||
mode_a: str
|
||||
mode_b: str
|
||||
jaccard_topk: float
|
||||
rbo_topk: float
|
||||
a_unique_files_topk: int
|
||||
b_unique_files_topk: int
|
||||
a_unique_dirs_topk: int
|
||||
b_unique_dirs_topk: int
|
||||
|
||||
|
||||
def _run_once(
|
||||
engine: ChainSearchEngine,
|
||||
config: Config,
|
||||
query: str,
|
||||
source_path: Path,
|
||||
*,
|
||||
stage2_mode: str,
|
||||
k: int,
|
||||
coarse_k: int,
|
||||
) -> RunDetail:
|
||||
if stage2_mode not in VALID_STAGE2_MODES:
|
||||
raise ValueError(f"Invalid stage2_mode: {stage2_mode}")
|
||||
|
||||
# Mutate config for this run; ChainSearchEngine reads config fields per-call.
|
||||
config.staged_stage2_mode = stage2_mode
|
||||
|
||||
gc.collect()
|
||||
start_ms = _now_ms()
|
||||
try:
|
||||
result = engine.cascade_search(
|
||||
query=query,
|
||||
source_path=source_path,
|
||||
k=k,
|
||||
coarse_k=coarse_k,
|
||||
strategy="staged",
|
||||
)
|
||||
latency_ms = _now_ms() - start_ms
|
||||
paths_raw = [r.path for r in (result.results or []) if getattr(r, "path", None)]
|
||||
paths = [_normalize_path_key(p) for p in paths_raw]
|
||||
|
||||
topk: List[str] = []
|
||||
seen: set[str] = set()
|
||||
for p in paths:
|
||||
if p in seen:
|
||||
continue
|
||||
seen.add(p)
|
||||
topk.append(p)
|
||||
if len(topk) >= k:
|
||||
break
|
||||
|
||||
stage_stats = None
|
||||
try:
|
||||
stage_stats = _extract_stage_stats(getattr(result.stats, "errors", []) or [])
|
||||
except Exception:
|
||||
stage_stats = None
|
||||
|
||||
return RunDetail(
|
||||
stage2_mode=stage2_mode,
|
||||
query=query,
|
||||
latency_ms=latency_ms,
|
||||
num_results=len(result.results or []),
|
||||
topk_paths=topk,
|
||||
stage_stats=stage_stats,
|
||||
error=None,
|
||||
)
|
||||
except Exception as exc:
|
||||
return RunDetail(
|
||||
stage2_mode=stage2_mode,
|
||||
query=query,
|
||||
latency_ms=_now_ms() - start_ms,
|
||||
num_results=0,
|
||||
topk_paths=[],
|
||||
stage_stats=None,
|
||||
error=str(exc),
|
||||
)
|
||||
|
||||
|
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser(description="Compare staged Stage-2 expansion modes.")
|
||||
parser.add_argument("--source", type=Path, default=Path.cwd(), help="Project path to search")
|
||||
parser.add_argument("--queries-file", type=Path, default=None, help="Optional newline-delimited queries file")
|
||||
parser.add_argument("--queries", nargs="*", default=None, help="Inline queries (overrides queries-file)")
|
||||
parser.add_argument("--k", type=int, default=20, help="Top-k to evaluate")
|
||||
parser.add_argument("--coarse-k", type=int, default=100, help="Stage-1 coarse_k")
|
||||
parser.add_argument(
|
||||
"--stage2-modes",
|
||||
nargs="*",
|
||||
default=list(VALID_STAGE2_MODES),
|
||||
help="Stage-2 modes to compare",
|
||||
)
|
||||
parser.add_argument("--warmup", type=int, default=0, help="Warmup iterations per mode")
|
||||
parser.add_argument(
|
||||
"--output",
|
||||
type=Path,
|
||||
default=Path(__file__).parent / "results" / "staged_stage2_modes.json",
|
||||
help="Output JSON path",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
if not args.source.exists():
|
||||
raise SystemExit(f"Source path does not exist: {args.source}")
|
||||
|
||||
stage2_modes = [str(m).strip().lower() for m in (args.stage2_modes or []) if str(m).strip()]
|
||||
for m in stage2_modes:
|
||||
if m not in VALID_STAGE2_MODES:
|
||||
raise SystemExit(f"Invalid --stage2-modes entry: {m} (valid: {', '.join(VALID_STAGE2_MODES)})")
|
||||
|
||||
queries = _load_queries(args.queries_file, args.queries)
|
||||
if not queries:
|
||||
raise SystemExit("No queries to run")
|
||||
|
||||
# Match CLI behavior: load settings + apply global/workspace .env overrides.
|
||||
config = Config.load()
|
||||
config.cascade_strategy = "staged"
|
||||
config.enable_staged_rerank = True
|
||||
config.embedding_use_gpu = False # stability on some Windows setups
|
||||
|
||||
registry = RegistryStore()
|
||||
registry.initialize()
|
||||
mapper = PathMapper()
|
||||
engine = ChainSearchEngine(registry=registry, mapper=mapper, config=config)
|
||||
|
||||
try:
|
||||
# Warmup
|
||||
if args.warmup > 0:
|
||||
warm_query = queries[0]
|
||||
for mode in stage2_modes:
|
||||
for _ in range(args.warmup):
|
||||
try:
|
||||
_run_once(
|
||||
engine,
|
||||
config,
|
||||
warm_query,
|
||||
args.source,
|
||||
stage2_mode=mode,
|
||||
k=min(args.k, 5),
|
||||
coarse_k=min(args.coarse_k, 50),
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
per_query: Dict[str, Dict[str, RunDetail]] = {}
|
||||
runs: List[RunDetail] = []
|
||||
comparisons: List[PairwiseCompare] = []
|
||||
|
||||
for i, query in enumerate(queries, start=1):
|
||||
print(f"[{i}/{len(queries)}] {query}")
|
||||
per_query[query] = {}
|
||||
|
||||
for mode in stage2_modes:
|
||||
detail = _run_once(
|
||||
engine,
|
||||
config,
|
||||
query,
|
||||
args.source,
|
||||
stage2_mode=mode,
|
||||
k=args.k,
|
||||
coarse_k=args.coarse_k,
|
||||
)
|
||||
per_query[query][mode] = detail
|
||||
runs.append(detail)
|
||||
|
||||
# Pairwise overlaps for this query
|
||||
for a_idx in range(len(stage2_modes)):
|
||||
for b_idx in range(a_idx + 1, len(stage2_modes)):
|
||||
mode_a = stage2_modes[a_idx]
|
||||
mode_b = stage2_modes[b_idx]
|
||||
a = per_query[query][mode_a]
|
||||
b = per_query[query][mode_b]
|
||||
comparisons.append(
|
||||
PairwiseCompare(
|
||||
query=query,
|
||||
mode_a=mode_a,
|
||||
mode_b=mode_b,
|
||||
jaccard_topk=jaccard_topk(a.topk_paths, b.topk_paths),
|
||||
rbo_topk=rbo(a.topk_paths, b.topk_paths, p=0.9),
|
||||
a_unique_files_topk=len(set(a.topk_paths)),
|
||||
b_unique_files_topk=len(set(b.topk_paths)),
|
||||
a_unique_dirs_topk=_unique_parent_dirs(a.topk_paths),
|
||||
b_unique_dirs_topk=_unique_parent_dirs(b.topk_paths),
|
||||
)
|
||||
)
|
||||
|
||||
def _latencies(details: List[RunDetail]) -> List[float]:
|
||||
return [d.latency_ms for d in details if not d.error]
|
||||
|
||||
mode_summaries: Dict[str, Dict[str, Any]] = {}
|
||||
for mode in stage2_modes:
|
||||
mode_runs = [r for r in runs if r.stage2_mode == mode]
|
||||
lat = _latencies(mode_runs)
|
||||
mode_summaries[mode] = {
|
||||
"success": sum(1 for r in mode_runs if not r.error),
|
||||
"avg_latency_ms": statistics.mean(lat) if lat else 0.0,
|
||||
"p50_latency_ms": statistics.median(lat) if lat else 0.0,
|
||||
"p95_latency_ms": statistics.quantiles(lat, n=20)[18] if len(lat) >= 2 else (lat[0] if lat else 0.0),
|
||||
}
|
||||
|
||||
summary = {
|
||||
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"source": str(args.source),
|
||||
"k": args.k,
|
||||
"coarse_k": args.coarse_k,
|
||||
"query_count": len(queries),
|
||||
"stage2_modes": stage2_modes,
|
||||
"modes": mode_summaries,
|
||||
"avg_pairwise_jaccard_topk": statistics.mean([c.jaccard_topk for c in comparisons]) if comparisons else 0.0,
|
||||
"avg_pairwise_rbo_topk": statistics.mean([c.rbo_topk for c in comparisons]) if comparisons else 0.0,
|
||||
}
|
||||
|
||||
args.output.parent.mkdir(parents=True, exist_ok=True)
|
||||
payload = {
|
||||
"summary": summary,
|
||||
"runs": [asdict(r) for r in runs],
|
||||
"comparisons": [asdict(c) for c in comparisons],
|
||||
}
|
||||
args.output.write_text(json.dumps(payload, indent=2), encoding="utf-8")
|
||||
print(f"\nSaved: {args.output}")
|
||||
finally:
|
||||
try:
|
||||
engine.close()
|
||||
except Exception as exc:
|
||||
print(f"WARNING engine.close() failed: {exc!r}", file=sys.stderr)
|
||||
try:
|
||||
registry.close()
|
||||
except Exception as exc:
|
||||
print(f"WARNING registry.close() failed: {exc!r}", file=sys.stderr)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,527 +0,0 @@
|
||||
"""Analysis script for hybrid search method contribution and storage architecture.
|
||||
|
||||
This script analyzes:
|
||||
1. Individual method contribution in hybrid search (FTS/Vector)
|
||||
2. Storage architecture conflicts between different retrieval methods
|
||||
3. FTS + Rerank fusion experiment
|
||||
"""
|
||||
|
||||
import json
|
||||
import sqlite3
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Tuple, Any
|
||||
from collections import defaultdict
|
||||
|
||||
# Add project root to path
|
||||
import sys
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
||||
|
||||
from codexlens.storage.registry import RegistryStore
|
||||
from codexlens.storage.path_mapper import PathMapper
|
||||
from codexlens.search.hybrid_search import HybridSearchEngine
|
||||
from codexlens.search.ranking import (
|
||||
reciprocal_rank_fusion,
|
||||
cross_encoder_rerank,
|
||||
DEFAULT_WEIGHTS,
|
||||
)
|
||||
from codexlens.entities import SearchResult
|
||||
|
||||
|
||||
def find_project_index(source_path: Path) -> Path:
|
||||
"""Find the index database for a project."""
|
||||
registry = RegistryStore()
|
||||
registry.initialize()
|
||||
|
||||
mapper = PathMapper()
|
||||
index_path = mapper.source_to_index_db(source_path)
|
||||
|
||||
if not index_path.exists():
|
||||
nearest = registry.find_nearest_index(source_path)
|
||||
if nearest:
|
||||
index_path = nearest.index_path
|
||||
|
||||
registry.close()
|
||||
return index_path
|
||||
|
||||
|
||||
def analyze_storage_architecture(index_path: Path) -> Dict[str, Any]:
|
||||
"""Analyze storage tables and check for conflicts.
|
||||
|
||||
Returns:
|
||||
Dictionary with table analysis and conflict detection.
|
||||
"""
|
||||
results = {
|
||||
"tables": {},
|
||||
"conflicts": [],
|
||||
"recommendations": []
|
||||
}
|
||||
|
||||
with sqlite3.connect(index_path) as conn:
|
||||
# Get all tables
|
||||
cursor = conn.execute(
|
||||
"SELECT name FROM sqlite_master WHERE type='table' ORDER BY name"
|
||||
)
|
||||
tables = [row[0] for row in cursor.fetchall()]
|
||||
|
||||
for table in tables:
|
||||
# Get row count and columns
|
||||
try:
|
||||
count = conn.execute(f"SELECT COUNT(*) FROM {table}").fetchone()[0]
|
||||
cols = conn.execute(f"PRAGMA table_info({table})").fetchall()
|
||||
col_names = [c[1] for c in cols]
|
||||
|
||||
results["tables"][table] = {
|
||||
"row_count": count,
|
||||
"columns": col_names
|
||||
}
|
||||
except Exception as e:
|
||||
results["tables"][table] = {"error": str(e)}
|
||||
|
||||
# Check for data overlap/conflicts
|
||||
# 1. Check if chunks and semantic_chunks have different data
|
||||
if "chunks" in tables and "semantic_chunks" in tables:
|
||||
chunks_count = results["tables"]["chunks"]["row_count"]
|
||||
semantic_count = results["tables"]["semantic_chunks"]["row_count"]
|
||||
|
||||
if chunks_count > 0 and semantic_count > 0:
|
||||
# Check for ID overlap
|
||||
overlap = conn.execute("""
|
||||
SELECT COUNT(*) FROM chunks c
|
||||
JOIN semantic_chunks sc ON c.id = sc.id
|
||||
""").fetchone()[0]
|
||||
|
||||
results["conflicts"].append({
|
||||
"type": "table_overlap",
|
||||
"tables": ["chunks", "semantic_chunks"],
|
||||
"chunks_count": chunks_count,
|
||||
"semantic_count": semantic_count,
|
||||
"id_overlap": overlap,
|
||||
"description": (
|
||||
f"Both chunks ({chunks_count}) and semantic_chunks ({semantic_count}) "
|
||||
f"have data. ID overlap: {overlap}. "
|
||||
"This can cause confusion - binary_cascade reads from semantic_chunks "
|
||||
"but SQLiteStore reads from chunks."
|
||||
)
|
||||
})
|
||||
elif chunks_count == 0 and semantic_count > 0:
|
||||
results["recommendations"].append(
|
||||
"chunks table is empty but semantic_chunks has data. "
|
||||
"Use cascade-index (semantic_chunks) for better semantic search."
|
||||
)
|
||||
elif chunks_count > 0 and semantic_count == 0:
|
||||
results["recommendations"].append(
|
||||
"semantic_chunks is empty. Run 'codexlens cascade-index' to enable "
|
||||
"binary cascade search."
|
||||
)
|
||||
|
||||
# 2. Check FTS tables
|
||||
fts_tables = [t for t in tables if t.startswith("files_fts")]
|
||||
if len(fts_tables) >= 2:
|
||||
results["recommendations"].append(
|
||||
f"Found {len(fts_tables)} FTS tables: {fts_tables}. "
|
||||
"Dual FTS (exact + fuzzy) is properly configured."
|
||||
)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def analyze_method_contributions(
|
||||
index_path: Path,
|
||||
queries: List[str],
|
||||
limit: int = 20
|
||||
) -> Dict[str, Any]:
|
||||
"""Analyze contribution of each retrieval method.
|
||||
|
||||
Runs each method independently and measures:
|
||||
- Result count
|
||||
- Latency
|
||||
- Score distribution
|
||||
- Overlap with other methods
|
||||
"""
|
||||
results = {
|
||||
"per_query": [],
|
||||
"summary": {}
|
||||
}
|
||||
|
||||
for query in queries:
|
||||
query_result = {
|
||||
"query": query,
|
||||
"methods": {},
|
||||
"fusion_analysis": {}
|
||||
}
|
||||
|
||||
# Run each method independently
|
||||
methods = {
|
||||
"fts_exact": {"fuzzy": False, "vector": False},
|
||||
"fts_fuzzy": {"fuzzy": True, "vector": False},
|
||||
"vector": {"fuzzy": False, "vector": True},
|
||||
}
|
||||
|
||||
method_results: Dict[str, List[SearchResult]] = {}
|
||||
|
||||
for method_name, config in methods.items():
|
||||
try:
|
||||
engine = HybridSearchEngine()
|
||||
|
||||
# Set config to disable/enable specific backends
|
||||
engine._config = type('obj', (object,), {
|
||||
'use_fts_fallback': method_name.startswith("fts"),
|
||||
'embedding_use_gpu': True,
|
||||
})()
|
||||
|
||||
start = time.perf_counter()
|
||||
|
||||
if method_name == "fts_exact":
|
||||
# Force FTS fallback mode with fuzzy disabled
|
||||
engine.weights = DEFAULT_WEIGHTS.copy()
|
||||
results_list = engine.search(
|
||||
index_path, query, limit=limit,
|
||||
enable_fuzzy=False, enable_vector=False, pure_vector=False
|
||||
)
|
||||
elif method_name == "fts_fuzzy":
|
||||
engine.weights = DEFAULT_WEIGHTS.copy()
|
||||
results_list = engine.search(
|
||||
index_path, query, limit=limit,
|
||||
enable_fuzzy=True, enable_vector=False, pure_vector=False
|
||||
)
|
||||
elif method_name == "vector":
|
||||
results_list = engine.search(
|
||||
index_path, query, limit=limit,
|
||||
enable_fuzzy=False, enable_vector=True, pure_vector=True
|
||||
)
|
||||
else:
|
||||
results_list = []
|
||||
|
||||
latency = (time.perf_counter() - start) * 1000
|
||||
|
||||
method_results[method_name] = results_list
|
||||
|
||||
scores = [r.score for r in results_list]
|
||||
query_result["methods"][method_name] = {
|
||||
"count": len(results_list),
|
||||
"latency_ms": latency,
|
||||
"avg_score": sum(scores) / len(scores) if scores else 0,
|
||||
"max_score": max(scores) if scores else 0,
|
||||
"min_score": min(scores) if scores else 0,
|
||||
"top_3_files": [r.path.split("\\")[-1] for r in results_list[:3]]
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
query_result["methods"][method_name] = {
|
||||
"error": str(e),
|
||||
"count": 0
|
||||
}
|
||||
|
||||
# Compute overlap between methods
|
||||
method_paths = {
|
||||
name: set(r.path for r in results)
|
||||
for name, results in method_results.items()
|
||||
if results
|
||||
}
|
||||
|
||||
overlaps = {}
|
||||
method_names = list(method_paths.keys())
|
||||
for i, m1 in enumerate(method_names):
|
||||
for m2 in method_names[i+1:]:
|
||||
overlap = len(method_paths[m1] & method_paths[m2])
|
||||
union = len(method_paths[m1] | method_paths[m2])
|
||||
jaccard = overlap / union if union > 0 else 0
|
||||
overlaps[f"{m1}_vs_{m2}"] = {
|
||||
"overlap_count": overlap,
|
||||
"jaccard": jaccard,
|
||||
f"{m1}_unique": len(method_paths[m1] - method_paths[m2]),
|
||||
f"{m2}_unique": len(method_paths[m2] - method_paths[m1]),
|
||||
}
|
||||
|
||||
query_result["overlaps"] = overlaps
|
||||
|
||||
# Analyze RRF fusion contribution
|
||||
if len(method_results) >= 2:
|
||||
# Compute RRF with each method's contribution
|
||||
rrf_map = {}
|
||||
for name, results in method_results.items():
|
||||
if results and name in ["fts_exact", "vector"]:
|
||||
# Rename for RRF
|
||||
rrf_name = name.replace("fts_exact", "exact")
|
||||
rrf_map[rrf_name] = results
|
||||
|
||||
if rrf_map:
|
||||
fused = reciprocal_rank_fusion(rrf_map, k=60)
|
||||
|
||||
# Analyze which methods contributed to top results
|
||||
source_contributions = defaultdict(int)
|
||||
for r in fused[:10]:
|
||||
source_ranks = r.metadata.get("source_ranks", {})
|
||||
for source in source_ranks:
|
||||
source_contributions[source] += 1
|
||||
|
||||
query_result["fusion_analysis"] = {
|
||||
"total_fused": len(fused),
|
||||
"top_10_source_distribution": dict(source_contributions)
|
||||
}
|
||||
|
||||
results["per_query"].append(query_result)
|
||||
|
||||
# Compute summary statistics
|
||||
method_stats = defaultdict(lambda: {"counts": [], "latencies": []})
|
||||
for qr in results["per_query"]:
|
||||
for method, data in qr["methods"].items():
|
||||
if "count" in data:
|
||||
method_stats[method]["counts"].append(data["count"])
|
||||
if "latency_ms" in data:
|
||||
method_stats[method]["latencies"].append(data["latency_ms"])
|
||||
|
||||
results["summary"] = {
|
||||
method: {
|
||||
"avg_count": sum(s["counts"]) / len(s["counts"]) if s["counts"] else 0,
|
||||
"avg_latency_ms": sum(s["latencies"]) / len(s["latencies"]) if s["latencies"] else 0,
|
||||
}
|
||||
for method, s in method_stats.items()
|
||||
}
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def experiment_fts_rerank_fusion(
|
||||
index_path: Path,
|
||||
queries: List[str],
|
||||
limit: int = 10,
|
||||
coarse_k: int = 50
|
||||
) -> Dict[str, Any]:
|
||||
"""Experiment: FTS + Rerank fusion vs standard hybrid.
|
||||
|
||||
Compares:
|
||||
1. Standard Hybrid (FTS + Vector RRF)
|
||||
2. FTS + CrossEncoder Rerank -> then fuse with Vector
|
||||
"""
|
||||
results = {
|
||||
"per_query": [],
|
||||
"summary": {}
|
||||
}
|
||||
|
||||
# Initialize reranker
|
||||
try:
|
||||
from codexlens.semantic.reranker import get_reranker, check_reranker_available
|
||||
ok, _ = check_reranker_available("onnx")
|
||||
if ok:
|
||||
reranker = get_reranker(backend="onnx", use_gpu=True)
|
||||
else:
|
||||
reranker = None
|
||||
except Exception as e:
|
||||
print(f"Reranker unavailable: {e}")
|
||||
reranker = None
|
||||
|
||||
for query in queries:
|
||||
query_result = {
|
||||
"query": query,
|
||||
"strategies": {}
|
||||
}
|
||||
|
||||
# Strategy 1: Standard Hybrid (FTS + Vector)
|
||||
try:
|
||||
engine = HybridSearchEngine(weights=DEFAULT_WEIGHTS)
|
||||
engine._config = type('obj', (object,), {
|
||||
'use_fts_fallback': False,
|
||||
'embedding_use_gpu': True,
|
||||
})()
|
||||
|
||||
start = time.perf_counter()
|
||||
standard_results = engine.search(
|
||||
index_path, query, limit=limit,
|
||||
enable_vector=True
|
||||
)
|
||||
standard_latency = (time.perf_counter() - start) * 1000
|
||||
|
||||
query_result["strategies"]["standard_hybrid"] = {
|
||||
"count": len(standard_results),
|
||||
"latency_ms": standard_latency,
|
||||
"top_5": [r.path.split("\\")[-1] for r in standard_results[:5]],
|
||||
"scores": [r.score for r in standard_results[:5]]
|
||||
}
|
||||
except Exception as e:
|
||||
query_result["strategies"]["standard_hybrid"] = {"error": str(e)}
|
||||
|
||||
# Strategy 2: FTS + Rerank -> Fuse with Vector
|
||||
try:
|
||||
# Step 1: Get FTS results (coarse)
|
||||
fts_engine = HybridSearchEngine(weights=DEFAULT_WEIGHTS)
|
||||
fts_engine._config = type('obj', (object,), {
|
||||
'use_fts_fallback': True,
|
||||
'embedding_use_gpu': True,
|
||||
})()
|
||||
|
||||
start = time.perf_counter()
|
||||
fts_results = fts_engine.search(
|
||||
index_path, query, limit=coarse_k,
|
||||
enable_fuzzy=True, enable_vector=False
|
||||
)
|
||||
fts_latency = (time.perf_counter() - start) * 1000
|
||||
|
||||
# Step 2: Rerank FTS results with CrossEncoder
|
||||
if reranker and fts_results:
|
||||
rerank_start = time.perf_counter()
|
||||
reranked_fts = cross_encoder_rerank(
|
||||
query, fts_results, reranker, top_k=20
|
||||
)
|
||||
rerank_latency = (time.perf_counter() - rerank_start) * 1000
|
||||
else:
|
||||
reranked_fts = fts_results[:20]
|
||||
rerank_latency = 0
|
||||
|
||||
# Step 3: Get Vector results
|
||||
vector_engine = HybridSearchEngine()
|
||||
vector_results = vector_engine.search(
|
||||
index_path, query, limit=20,
|
||||
enable_vector=True, pure_vector=True
|
||||
)
|
||||
|
||||
# Step 4: Fuse reranked FTS with Vector
|
||||
if reranked_fts and vector_results:
|
||||
fusion_map = {
|
||||
"fts_reranked": reranked_fts,
|
||||
"vector": vector_results
|
||||
}
|
||||
fused_results = reciprocal_rank_fusion(
|
||||
fusion_map,
|
||||
weights={"fts_reranked": 0.5, "vector": 0.5},
|
||||
k=60
|
||||
)
|
||||
else:
|
||||
fused_results = reranked_fts or vector_results or []
|
||||
|
||||
total_latency = fts_latency + rerank_latency + (time.perf_counter() - start) * 1000
|
||||
|
||||
query_result["strategies"]["fts_rerank_fusion"] = {
|
||||
"count": len(fused_results),
|
||||
"total_latency_ms": fts_latency + rerank_latency,
|
||||
"fts_latency_ms": fts_latency,
|
||||
"rerank_latency_ms": rerank_latency,
|
||||
"top_5": [r.path.split("\\")[-1] for r in fused_results[:5]],
|
||||
"scores": [r.score for r in fused_results[:5]]
|
||||
}
|
||||
except Exception as e:
|
||||
query_result["strategies"]["fts_rerank_fusion"] = {"error": str(e)}
|
||||
|
||||
# Compute overlap between strategies
|
||||
if (
|
||||
"error" not in query_result["strategies"].get("standard_hybrid", {})
|
||||
and "error" not in query_result["strategies"].get("fts_rerank_fusion", {})
|
||||
):
|
||||
standard_paths = set(r.path.split("\\")[-1] for r in standard_results[:10])
|
||||
fts_rerank_paths = set(r.path.split("\\")[-1] for r in fused_results[:10])
|
||||
|
||||
overlap = len(standard_paths & fts_rerank_paths)
|
||||
query_result["comparison"] = {
|
||||
"top_10_overlap": overlap,
|
||||
"standard_unique": list(standard_paths - fts_rerank_paths)[:3],
|
||||
"fts_rerank_unique": list(fts_rerank_paths - standard_paths)[:3]
|
||||
}
|
||||
|
||||
results["per_query"].append(query_result)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def main():
|
||||
"""Run all analyses."""
|
||||
source_path = Path("D:/Claude_dms3/codex-lens/src")
|
||||
index_path = find_project_index(source_path)
|
||||
|
||||
print(f"Using index: {index_path}")
|
||||
print(f"Index exists: {index_path.exists()}")
|
||||
print()
|
||||
|
||||
# Test queries
|
||||
queries = [
|
||||
"binary quantization",
|
||||
"hamming distance search",
|
||||
"embeddings generation",
|
||||
"reranking algorithm",
|
||||
"database connection handling",
|
||||
]
|
||||
|
||||
# 1. Storage Architecture Analysis
|
||||
print("=" * 60)
|
||||
print("1. STORAGE ARCHITECTURE ANALYSIS")
|
||||
print("=" * 60)
|
||||
|
||||
storage_analysis = analyze_storage_architecture(index_path)
|
||||
|
||||
print("\nTable Overview:")
|
||||
for table, info in sorted(storage_analysis["tables"].items()):
|
||||
if "row_count" in info:
|
||||
print(f" {table}: {info['row_count']} rows")
|
||||
|
||||
print("\nConflicts Detected:")
|
||||
for conflict in storage_analysis["conflicts"]:
|
||||
print(f" - {conflict['description']}")
|
||||
|
||||
print("\nRecommendations:")
|
||||
for rec in storage_analysis["recommendations"]:
|
||||
print(f" - {rec}")
|
||||
|
||||
# 2. Method Contribution Analysis
|
||||
print("\n" + "=" * 60)
|
||||
print("2. METHOD CONTRIBUTION ANALYSIS")
|
||||
print("=" * 60)
|
||||
|
||||
contribution_analysis = analyze_method_contributions(index_path, queries)
|
||||
|
||||
print("\nPer-Query Results:")
|
||||
for qr in contribution_analysis["per_query"]:
|
||||
print(f"\n Query: '{qr['query']}'")
|
||||
for method, data in qr["methods"].items():
|
||||
if "error" not in data:
|
||||
print(f" {method}: {data['count']} results, {data['latency_ms']:.1f}ms")
|
||||
if data.get("top_3_files"):
|
||||
print(f" Top 3: {', '.join(data['top_3_files'])}")
|
||||
|
||||
if qr.get("overlaps"):
|
||||
print(" Overlaps:")
|
||||
for pair, info in qr["overlaps"].items():
|
||||
print(f" {pair}: {info['overlap_count']} common (Jaccard: {info['jaccard']:.2f})")
|
||||
|
||||
print("\nSummary:")
|
||||
for method, stats in contribution_analysis["summary"].items():
|
||||
print(f" {method}: avg {stats['avg_count']:.1f} results, {stats['avg_latency_ms']:.1f}ms")
|
||||
|
||||
# 3. FTS + Rerank Fusion Experiment
|
||||
print("\n" + "=" * 60)
|
||||
print("3. FTS + RERANK FUSION EXPERIMENT")
|
||||
print("=" * 60)
|
||||
|
||||
fusion_experiment = experiment_fts_rerank_fusion(index_path, queries)
|
||||
|
||||
print("\nPer-Query Comparison:")
|
||||
for qr in fusion_experiment["per_query"]:
|
||||
print(f"\n Query: '{qr['query']}'")
|
||||
for strategy, data in qr["strategies"].items():
|
||||
if "error" not in data:
|
||||
latency = data.get("total_latency_ms") or data.get("latency_ms", 0)
|
||||
print(f" {strategy}: {data['count']} results, {latency:.1f}ms")
|
||||
if data.get("top_5"):
|
||||
print(f" Top 5: {', '.join(data['top_5'][:3])}...")
|
||||
|
||||
if qr.get("comparison"):
|
||||
comp = qr["comparison"]
|
||||
print(f" Top-10 Overlap: {comp['top_10_overlap']}/10")
|
||||
|
||||
# Save full results
|
||||
output_path = Path(__file__).parent / "results" / "method_contribution_analysis.json"
|
||||
output_path.parent.mkdir(exist_ok=True)
|
||||
|
||||
full_results = {
|
||||
"storage_analysis": storage_analysis,
|
||||
"contribution_analysis": contribution_analysis,
|
||||
"fusion_experiment": fusion_experiment
|
||||
}
|
||||
|
||||
with open(output_path, "w", encoding="utf-8") as f:
|
||||
json.dump(full_results, f, indent=2, default=str)
|
||||
|
||||
print(f"\n\nFull results saved to: {output_path}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1,277 +0,0 @@
|
||||
{
|
||||
"timestamp": "2026-01-02 11:48:33",
|
||||
"summaries": {
|
||||
"binary": {
|
||||
"strategy": "binary",
|
||||
"total_queries": 15,
|
||||
"successful_queries": 15,
|
||||
"avg_latency_ms": 1133.4008666667312,
|
||||
"min_latency_ms": 959.5361000028788,
|
||||
"max_latency_ms": 1330.8978999993997,
|
||||
"p50_latency_ms": 1125.8439999946859,
|
||||
"p95_latency_ms": 1330.0081999987015,
|
||||
"p99_latency_ms": 1330.71995999926,
|
||||
"avg_results": 10,
|
||||
"errors": []
|
||||
},
|
||||
"hybrid": {
|
||||
"strategy": "hybrid",
|
||||
"total_queries": 15,
|
||||
"successful_queries": 15,
|
||||
"avg_latency_ms": 1111.1401133336283,
|
||||
"min_latency_ms": 857.0021999985329,
|
||||
"max_latency_ms": 1278.8890000010724,
|
||||
"p50_latency_ms": 1130.696000000171,
|
||||
"p95_latency_ms": 1254.2417899981956,
|
||||
"p99_latency_ms": 1273.959558000497,
|
||||
"avg_results": 10,
|
||||
"errors": []
|
||||
}
|
||||
},
|
||||
"details": {
|
||||
"binary": [
|
||||
{
|
||||
"strategy": "binary",
|
||||
"query": "def search",
|
||||
"latency_ms": 1044.525999997859,
|
||||
"num_results": 10,
|
||||
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py:0",
|
||||
"error": null
|
||||
},
|
||||
{
|
||||
"strategy": "binary",
|
||||
"query": "class Engine",
|
||||
"latency_ms": 1052.5979999947594,
|
||||
"num_results": 10,
|
||||
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py:0",
|
||||
"error": null
|
||||
},
|
||||
{
|
||||
"strategy": "binary",
|
||||
"query": "import numpy",
|
||||
"latency_ms": 1217.217100005655,
|
||||
"num_results": 10,
|
||||
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\__main__.py:0",
|
||||
"error": null
|
||||
},
|
||||
{
|
||||
"strategy": "binary",
|
||||
"query": "async def",
|
||||
"latency_ms": 1276.9802000038908,
|
||||
"num_results": 10,
|
||||
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\api_reranker.py:0",
|
||||
"error": null
|
||||
},
|
||||
{
|
||||
"strategy": "binary",
|
||||
"query": "raise ValueError",
|
||||
"latency_ms": 1005.9053000004496,
|
||||
"num_results": 10,
|
||||
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py:0",
|
||||
"error": null
|
||||
},
|
||||
{
|
||||
"strategy": "binary",
|
||||
"query": "how to parse json",
|
||||
"latency_ms": 1330.8978999993997,
|
||||
"num_results": 10,
|
||||
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py:0",
|
||||
"error": null
|
||||
},
|
||||
{
|
||||
"strategy": "binary",
|
||||
"query": "database connection",
|
||||
"latency_ms": 1041.6685000018333,
|
||||
"num_results": 10,
|
||||
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\indexing\\symbol_extractor.py:0",
|
||||
"error": null
|
||||
},
|
||||
{
|
||||
"strategy": "binary",
|
||||
"query": "error handling",
|
||||
"latency_ms": 959.5361000028788,
|
||||
"num_results": 10,
|
||||
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_004_dual_fts.py:0",
|
||||
"error": null
|
||||
},
|
||||
{
|
||||
"strategy": "binary",
|
||||
"query": "authentication logic",
|
||||
"latency_ms": 1060.9395999999833,
|
||||
"num_results": 10,
|
||||
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py:0",
|
||||
"error": null
|
||||
},
|
||||
{
|
||||
"strategy": "binary",
|
||||
"query": "file read write",
|
||||
"latency_ms": 971.8680000005406,
|
||||
"num_results": 10,
|
||||
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\watcher\\incremental_indexer.py:0",
|
||||
"error": null
|
||||
},
|
||||
{
|
||||
"strategy": "binary",
|
||||
"query": "embedding vector",
|
||||
"latency_ms": 1135.879900000873,
|
||||
"num_results": 10,
|
||||
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\embedder.py:0",
|
||||
"error": null
|
||||
},
|
||||
{
|
||||
"strategy": "binary",
|
||||
"query": "cosine similarity",
|
||||
"latency_ms": 1188.1732000038028,
|
||||
"num_results": 10,
|
||||
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py:0",
|
||||
"error": null
|
||||
},
|
||||
{
|
||||
"strategy": "binary",
|
||||
"query": "binary quantization",
|
||||
"latency_ms": 1259.3522999959532,
|
||||
"num_results": 10,
|
||||
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py:0",
|
||||
"error": null
|
||||
},
|
||||
{
|
||||
"strategy": "binary",
|
||||
"query": "hamming distance",
|
||||
"latency_ms": 1329.6268999984022,
|
||||
"num_results": 10,
|
||||
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py:0",
|
||||
"error": null
|
||||
},
|
||||
{
|
||||
"strategy": "binary",
|
||||
"query": "reranking",
|
||||
"latency_ms": 1125.8439999946859,
|
||||
"num_results": 10,
|
||||
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py:0",
|
||||
"error": null
|
||||
}
|
||||
],
|
||||
"hybrid": [
|
||||
{
|
||||
"strategy": "hybrid",
|
||||
"query": "def search",
|
||||
"latency_ms": 1117.0937999995658,
|
||||
"num_results": 10,
|
||||
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py:0",
|
||||
"error": null
|
||||
},
|
||||
{
|
||||
"strategy": "hybrid",
|
||||
"query": "class Engine",
|
||||
"latency_ms": 1039.3984000038472,
|
||||
"num_results": 10,
|
||||
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py:0",
|
||||
"error": null
|
||||
},
|
||||
{
|
||||
"strategy": "hybrid",
|
||||
"query": "import numpy",
|
||||
"latency_ms": 1144.7916999968584,
|
||||
"num_results": 10,
|
||||
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\__main__.py:0",
|
||||
"error": null
|
||||
},
|
||||
{
|
||||
"strategy": "hybrid",
|
||||
"query": "async def",
|
||||
"latency_ms": 857.0021999985329,
|
||||
"num_results": 10,
|
||||
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\api_reranker.py:0",
|
||||
"error": null
|
||||
},
|
||||
{
|
||||
"strategy": "hybrid",
|
||||
"query": "raise ValueError",
|
||||
"latency_ms": 957.5578000003588,
|
||||
"num_results": 10,
|
||||
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py:0",
|
||||
"error": null
|
||||
},
|
||||
{
|
||||
"strategy": "hybrid",
|
||||
"query": "how to parse json",
|
||||
"latency_ms": 1216.5708000029554,
|
||||
"num_results": 10,
|
||||
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py:0",
|
||||
"error": null
|
||||
},
|
||||
{
|
||||
"strategy": "hybrid",
|
||||
"query": "database connection",
|
||||
"latency_ms": 1154.8929000055068,
|
||||
"num_results": 10,
|
||||
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\indexing\\symbol_extractor.py:0",
|
||||
"error": null
|
||||
},
|
||||
{
|
||||
"strategy": "hybrid",
|
||||
"query": "error handling",
|
||||
"latency_ms": 1130.696000000171,
|
||||
"num_results": 10,
|
||||
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_004_dual_fts.py:0",
|
||||
"error": null
|
||||
},
|
||||
{
|
||||
"strategy": "hybrid",
|
||||
"query": "authentication logic",
|
||||
"latency_ms": 1112.8943000003346,
|
||||
"num_results": 10,
|
||||
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py:0",
|
||||
"error": null
|
||||
},
|
||||
{
|
||||
"strategy": "hybrid",
|
||||
"query": "file read write",
|
||||
"latency_ms": 1172.5986000019475,
|
||||
"num_results": 10,
|
||||
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\watcher\\incremental_indexer.py:0",
|
||||
"error": null
|
||||
},
|
||||
{
|
||||
"strategy": "hybrid",
|
||||
"query": "embedding vector",
|
||||
"latency_ms": 1278.8890000010724,
|
||||
"num_results": 10,
|
||||
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\embedder.py:0",
|
||||
"error": null
|
||||
},
|
||||
{
|
||||
"strategy": "hybrid",
|
||||
"query": "cosine similarity",
|
||||
"latency_ms": 1024.2393000007723,
|
||||
"num_results": 10,
|
||||
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py:0",
|
||||
"error": null
|
||||
},
|
||||
{
|
||||
"strategy": "hybrid",
|
||||
"query": "binary quantization",
|
||||
"latency_ms": 1243.6786999969627,
|
||||
"num_results": 10,
|
||||
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py:0",
|
||||
"error": null
|
||||
},
|
||||
{
|
||||
"strategy": "hybrid",
|
||||
"query": "hamming distance",
|
||||
"latency_ms": 1081.3100999948801,
|
||||
"num_results": 10,
|
||||
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py:0",
|
||||
"error": null
|
||||
},
|
||||
{
|
||||
"strategy": "hybrid",
|
||||
"query": "reranking",
|
||||
"latency_ms": 1135.4881000006571,
|
||||
"num_results": 10,
|
||||
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py:0",
|
||||
"error": null
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,526 +0,0 @@
|
||||
{
|
||||
"timestamp": "2026-03-14 23:16:55",
|
||||
"source": "D:\\Claude_dms3",
|
||||
"queries_file": "D:\\Claude_dms3\\codex-lens\\benchmarks\\accuracy_queries_ccw_smart_search.jsonl",
|
||||
"query_count": 4,
|
||||
"k": 10,
|
||||
"coarse_k": 100,
|
||||
"local_only": true,
|
||||
"strategies": {
|
||||
"dense_rerank": {
|
||||
"query_count": 4,
|
||||
"hit_at_k": 0.0,
|
||||
"mrr_at_k": 0.0,
|
||||
"avg_recall_at_k": 0.0,
|
||||
"avg_latency_ms": 20171.940174996853,
|
||||
"p50_latency_ms": 14222.247749984264,
|
||||
"p95_latency_ms": 35222.31535999476,
|
||||
"errors": 0,
|
||||
"strategy": "dense_rerank",
|
||||
"stage2_mode": null
|
||||
},
|
||||
"staged:precomputed": {
|
||||
"query_count": 4,
|
||||
"hit_at_k": 0.0,
|
||||
"mrr_at_k": 0.0,
|
||||
"avg_recall_at_k": 0.0,
|
||||
"avg_latency_ms": 13679.793299987912,
|
||||
"p50_latency_ms": 12918.63379997015,
|
||||
"p95_latency_ms": 16434.964765003322,
|
||||
"errors": 0,
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "precomputed"
|
||||
},
|
||||
"staged:realtime": {
|
||||
"query_count": 4,
|
||||
"hit_at_k": 0.0,
|
||||
"mrr_at_k": 0.0,
|
||||
"avg_recall_at_k": 0.0,
|
||||
"avg_latency_ms": 13885.101849973202,
|
||||
"p50_latency_ms": 13826.323699980974,
|
||||
"p95_latency_ms": 14867.712269958853,
|
||||
"errors": 0,
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "realtime"
|
||||
},
|
||||
"staged:static_global_graph": {
|
||||
"query_count": 4,
|
||||
"hit_at_k": 0.0,
|
||||
"mrr_at_k": 0.0,
|
||||
"avg_recall_at_k": 0.0,
|
||||
"avg_latency_ms": 13336.124025002122,
|
||||
"p50_latency_ms": 13415.476950019598,
|
||||
"p95_latency_ms": 13514.329230004549,
|
||||
"errors": 0,
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "static_global_graph"
|
||||
}
|
||||
},
|
||||
"stage2_mode_matrix": {
|
||||
"precomputed": {
|
||||
"query_count": 4,
|
||||
"hit_at_k": 0.0,
|
||||
"mrr_at_k": 0.0,
|
||||
"avg_recall_at_k": 0.0,
|
||||
"avg_latency_ms": 13679.793299987912,
|
||||
"p50_latency_ms": 12918.63379997015,
|
||||
"p95_latency_ms": 16434.964765003322,
|
||||
"errors": 0,
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "precomputed"
|
||||
},
|
||||
"realtime": {
|
||||
"query_count": 4,
|
||||
"hit_at_k": 0.0,
|
||||
"mrr_at_k": 0.0,
|
||||
"avg_recall_at_k": 0.0,
|
||||
"avg_latency_ms": 13885.101849973202,
|
||||
"p50_latency_ms": 13826.323699980974,
|
||||
"p95_latency_ms": 14867.712269958853,
|
||||
"errors": 0,
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "realtime"
|
||||
},
|
||||
"static_global_graph": {
|
||||
"query_count": 4,
|
||||
"hit_at_k": 0.0,
|
||||
"mrr_at_k": 0.0,
|
||||
"avg_recall_at_k": 0.0,
|
||||
"avg_latency_ms": 13336.124025002122,
|
||||
"p50_latency_ms": 13415.476950019598,
|
||||
"p95_latency_ms": 13514.329230004549,
|
||||
"errors": 0,
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "static_global_graph"
|
||||
}
|
||||
},
|
||||
"pairwise_stage2_deltas": [
|
||||
{
|
||||
"mode_a": "precomputed",
|
||||
"mode_b": "realtime",
|
||||
"hit_at_k_delta": 0.0,
|
||||
"mrr_at_k_delta": 0.0,
|
||||
"avg_recall_at_k_delta": 0.0,
|
||||
"avg_latency_ms_delta": -205.30854998528957
|
||||
},
|
||||
{
|
||||
"mode_a": "precomputed",
|
||||
"mode_b": "static_global_graph",
|
||||
"hit_at_k_delta": 0.0,
|
||||
"mrr_at_k_delta": 0.0,
|
||||
"avg_recall_at_k_delta": 0.0,
|
||||
"avg_latency_ms_delta": 343.66927498579025
|
||||
},
|
||||
{
|
||||
"mode_a": "realtime",
|
||||
"mode_b": "static_global_graph",
|
||||
"hit_at_k_delta": 0.0,
|
||||
"mrr_at_k_delta": 0.0,
|
||||
"avg_recall_at_k_delta": 0.0,
|
||||
"avg_latency_ms_delta": 548.9778249710798
|
||||
}
|
||||
],
|
||||
"config": {
|
||||
"embedding_backend": "fastembed",
|
||||
"embedding_model": "code",
|
||||
"embedding_use_gpu": false,
|
||||
"reranker_backend": "onnx",
|
||||
"reranker_model": "cross-encoder/ms-marco-MiniLM-L-6-v2",
|
||||
"enable_staged_rerank": true,
|
||||
"enable_cross_encoder_rerank": true
|
||||
},
|
||||
"evaluations": [
|
||||
{
|
||||
"query": "executeHybridMode dense_rerank semantic smart_search",
|
||||
"intent": "ccw-semantic-routing",
|
||||
"notes": "CCW semantic mode delegates to CodexLens dense_rerank.",
|
||||
"relevant_paths": [
|
||||
"D:\\Claude_dms3\\ccw\\src\\tools\\smart-search.ts"
|
||||
],
|
||||
"runs": {
|
||||
"dense_rerank": {
|
||||
"strategy_key": "dense_rerank",
|
||||
"strategy": "dense_rerank",
|
||||
"stage2_mode": null,
|
||||
"latency_ms": 38829.27079999447,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\routes\\issue-routes.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\tools\\session-manager.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\types\\queue-types.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\nativesessionpanel.tsx",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\history-importer.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\memory-extraction-pipeline.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\tests\\e2e\\skills-page.spec.ts",
|
||||
"d:\\claude_dms3\\ccw\\dist\\tools\\discover-design-files.js",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\api-settings\\clisettingsmodal.tsx",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\tests\\e2e\\api-settings.spec.ts"
|
||||
],
|
||||
"first_hit_rank": null,
|
||||
"hit_at_k": false,
|
||||
"recall_at_k": 0.0,
|
||||
"error": null
|
||||
},
|
||||
"staged:precomputed": {
|
||||
"strategy_key": "staged:precomputed",
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "precomputed",
|
||||
"latency_ms": 16915.833400011063,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\ccw\\src\\tools\\native-session-discovery.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\commands\\memory.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\useissues.test.tsx",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\routes\\cli-sessions-routes.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\lib\\api.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\filepreview.tsx",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\hooks\\hook-templates.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\utils\\file-reader.ts",
|
||||
"d:\\claude_dms3\\ccw\\dist\\core\\routes\\cli-sessions-routes.js",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\history-importer.ts"
|
||||
],
|
||||
"first_hit_rank": null,
|
||||
"hit_at_k": false,
|
||||
"recall_at_k": 0.0,
|
||||
"error": null
|
||||
},
|
||||
"staged:realtime": {
|
||||
"strategy_key": "staged:realtime",
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "realtime",
|
||||
"latency_ms": 13961.2567999959,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\ccw\\src\\tools\\native-session-discovery.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\commands\\memory.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\useissues.test.tsx",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\routes\\cli-sessions-routes.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\lib\\api.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\filepreview.tsx",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\hooks\\hook-templates.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\utils\\file-reader.ts",
|
||||
"d:\\claude_dms3\\ccw\\dist\\core\\routes\\cli-sessions-routes.js",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\history-importer.ts"
|
||||
],
|
||||
"first_hit_rank": null,
|
||||
"hit_at_k": false,
|
||||
"recall_at_k": 0.0,
|
||||
"error": null
|
||||
},
|
||||
"staged:static_global_graph": {
|
||||
"strategy_key": "staged:static_global_graph",
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "static_global_graph",
|
||||
"latency_ms": 12986.330999970436,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\ccw\\src\\tools\\native-session-discovery.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\commands\\memory.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\useissues.test.tsx",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\routes\\cli-sessions-routes.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\lib\\api.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\filepreview.tsx",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\hooks\\hook-templates.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\utils\\file-reader.ts",
|
||||
"d:\\claude_dms3\\ccw\\dist\\core\\routes\\cli-sessions-routes.js",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\history-importer.ts"
|
||||
],
|
||||
"first_hit_rank": null,
|
||||
"hit_at_k": false,
|
||||
"recall_at_k": 0.0,
|
||||
"error": null
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"query": "parse CodexLens JSON output strip ANSI smart_search",
|
||||
"intent": "ccw-json-fallback",
|
||||
"notes": "Covers JSON/plain-text fallback handling for CodexLens output.",
|
||||
"relevant_paths": [
|
||||
"D:\\Claude_dms3\\ccw\\src\\tools\\smart-search.ts"
|
||||
],
|
||||
"runs": {
|
||||
"dense_rerank": {
|
||||
"strategy_key": "dense_rerank",
|
||||
"strategy": "dense_rerank",
|
||||
"stage2_mode": null,
|
||||
"latency_ms": 14782.901199996471,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\ccw\\src\\tools\\codex-lens-lsp.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\issue\\queue\\queueexecuteinsession.tsx",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\terminal-dashboard\\queuepanel.tsx",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\usewebsocket.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\useflows.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\tests\\e2e\\api-error-monitoring.spec.ts",
|
||||
"d:\\claude_dms3\\ccw\\tests\\native-session-discovery.test.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\services\\checkpoint-service.ts",
|
||||
"d:\\claude_dms3\\ccw\\tests\\integration\\system-routes.test.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\tools\\native-session-discovery.ts"
|
||||
],
|
||||
"first_hit_rank": null,
|
||||
"hit_at_k": false,
|
||||
"recall_at_k": 0.0,
|
||||
"error": null
|
||||
},
|
||||
"staged:precomputed": {
|
||||
"strategy_key": "staged:precomputed",
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "precomputed",
|
||||
"latency_ms": 13710.042499959469,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\userealtimeupdates.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\routes\\cli-routes.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\stores\\queueexecutionstore.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\tools\\native-session-discovery.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\lib\\themeshare.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\clistreampanel.tsx",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\terminal-panel\\queueexecutionlistview.tsx",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\tests\\e2e\\api-settings.spec.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\test\\i18n.tsx",
|
||||
"d:\\claude_dms3\\ccw\\dist\\core\\history-importer.js"
|
||||
],
|
||||
"first_hit_rank": null,
|
||||
"hit_at_k": false,
|
||||
"recall_at_k": 0.0,
|
||||
"error": null
|
||||
},
|
||||
"staged:realtime": {
|
||||
"strategy_key": "staged:realtime",
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "realtime",
|
||||
"latency_ms": 15027.674999952316,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\userealtimeupdates.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\routes\\cli-routes.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\stores\\queueexecutionstore.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\tools\\native-session-discovery.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\lib\\themeshare.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\clistreampanel.tsx",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\terminal-panel\\queueexecutionlistview.tsx",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\tests\\e2e\\api-settings.spec.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\test\\i18n.tsx",
|
||||
"d:\\claude_dms3\\ccw\\dist\\core\\history-importer.js"
|
||||
],
|
||||
"first_hit_rank": null,
|
||||
"hit_at_k": false,
|
||||
"recall_at_k": 0.0,
|
||||
"error": null
|
||||
},
|
||||
"staged:static_global_graph": {
|
||||
"strategy_key": "staged:static_global_graph",
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "static_global_graph",
|
||||
"latency_ms": 13389.622500002384,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\userealtimeupdates.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\routes\\cli-routes.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\stores\\queueexecutionstore.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\tools\\native-session-discovery.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\lib\\themeshare.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\clistreampanel.tsx",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\terminal-panel\\queueexecutionlistview.tsx",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\tests\\e2e\\api-settings.spec.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\test\\i18n.tsx",
|
||||
"d:\\claude_dms3\\ccw\\dist\\core\\history-importer.js"
|
||||
],
|
||||
"first_hit_rank": null,
|
||||
"hit_at_k": false,
|
||||
"recall_at_k": 0.0,
|
||||
"error": null
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"query": "smart_search init embed search action schema",
|
||||
"intent": "ccw-action-schema",
|
||||
"notes": "Find the Zod schema that defines init/embed/search actions.",
|
||||
"relevant_paths": [
|
||||
"D:\\Claude_dms3\\ccw\\src\\tools\\smart-search.ts"
|
||||
],
|
||||
"runs": {
|
||||
"dense_rerank": {
|
||||
"strategy_key": "dense_rerank",
|
||||
"strategy": "dense_rerank",
|
||||
"stage2_mode": null,
|
||||
"latency_ms": 13661.594299972057,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\ccw\\src\\tools\\ask-question.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\a2ui\\a2uipopupcard.tsx",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\routes\\discovery-routes.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\a2ui\\a2uiwebsockethandler.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\useissues.test.tsx",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\tests\\e2e\\discovery.spec.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\tools\\__tests__\\ask-question.test.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\tests\\e2e\\api-settings.spec.ts",
|
||||
"d:\\claude_dms3\\ccw\\dist\\core\\a2ui\\a2uiwebsockethandler.js",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\tests\\e2e\\dashboard.spec.ts"
|
||||
],
|
||||
"first_hit_rank": null,
|
||||
"hit_at_k": false,
|
||||
"recall_at_k": 0.0,
|
||||
"error": null
|
||||
},
|
||||
"staged:precomputed": {
|
||||
"strategy_key": "staged:precomputed",
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "precomputed",
|
||||
"latency_ms": 12127.225099980831,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\lib\\api.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\lite-scanner-complete.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\themeselector.tsx",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\team\\teamheader.tsx",
|
||||
"d:\\claude_dms3\\ccw\\src\\tools\\ask-question.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\a2ui\\a2uipopupcard.tsx",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\issue\\discovery\\findinglist.tsx",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\api-settings\\clisettingsmodal.tsx",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\routes\\discovery-routes.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\a2ui\\a2uiwebsockethandler.ts"
|
||||
],
|
||||
"first_hit_rank": null,
|
||||
"hit_at_k": false,
|
||||
"recall_at_k": 0.0,
|
||||
"error": null
|
||||
},
|
||||
"staged:realtime": {
|
||||
"strategy_key": "staged:realtime",
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "realtime",
|
||||
"latency_ms": 12860.084999978542,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\lib\\api.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\lite-scanner-complete.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\themeselector.tsx",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\team\\teamheader.tsx",
|
||||
"d:\\claude_dms3\\ccw\\src\\tools\\ask-question.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\a2ui\\a2uipopupcard.tsx",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\issue\\discovery\\findinglist.tsx",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\api-settings\\clisettingsmodal.tsx",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\routes\\discovery-routes.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\a2ui\\a2uiwebsockethandler.ts"
|
||||
],
|
||||
"first_hit_rank": null,
|
||||
"hit_at_k": false,
|
||||
"recall_at_k": 0.0,
|
||||
"error": null
|
||||
},
|
||||
"staged:static_global_graph": {
|
||||
"strategy_key": "staged:static_global_graph",
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "static_global_graph",
|
||||
"latency_ms": 13441.331400036812,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\lib\\api.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\lite-scanner-complete.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\themeselector.tsx",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\team\\teamheader.tsx",
|
||||
"d:\\claude_dms3\\ccw\\src\\tools\\ask-question.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\a2ui\\a2uipopupcard.tsx",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\issue\\discovery\\findinglist.tsx",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\api-settings\\clisettingsmodal.tsx",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\routes\\discovery-routes.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\a2ui\\a2uiwebsockethandler.ts"
|
||||
],
|
||||
"first_hit_rank": null,
|
||||
"hit_at_k": false,
|
||||
"recall_at_k": 0.0,
|
||||
"error": null
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"query": "auto init missing job dedupe smart_search",
|
||||
"intent": "ccw-auto-init",
|
||||
"notes": "Targets background init/embed warmup and dedupe state.",
|
||||
"relevant_paths": [
|
||||
"D:\\Claude_dms3\\ccw\\src\\tools\\smart-search.ts"
|
||||
],
|
||||
"runs": {
|
||||
"dense_rerank": {
|
||||
"strategy_key": "dense_rerank",
|
||||
"strategy": "dense_rerank",
|
||||
"stage2_mode": null,
|
||||
"latency_ms": 13413.994400024414,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\pages\\memorypage.tsx",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\routes\\memory-routes.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\lib\\api.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\usememory.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\batchoperationtoolbar.tsx",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\tests\\e2e\\memory.spec.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\useprompthistory.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\stores\\flowstore.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\services\\deepwiki-service.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\routes\\claude-routes.ts"
|
||||
],
|
||||
"first_hit_rank": null,
|
||||
"hit_at_k": false,
|
||||
"recall_at_k": 0.0,
|
||||
"error": null
|
||||
},
|
||||
"staged:precomputed": {
|
||||
"strategy_key": "staged:precomputed",
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "precomputed",
|
||||
"latency_ms": 11966.072200000286,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\ccw\\src\\commands\\memory.ts",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\handlers.py",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\ui\\commandcombobox.tsx",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\global_graph_expander.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\definition.py",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\orchestrator\\orchestrationplanbuilder.ts",
|
||||
"d:\\claude_dms3\\codex-lens\\build\\lib\\codexlens\\lsp\\handlers.py",
|
||||
"d:\\claude_dms3\\codex-lens\\build\\lib\\codexlens\\search\\global_graph_expander.py",
|
||||
"d:\\claude_dms3\\codex-lens\\build\\lib\\codexlens\\api\\definition.py",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\pages\\memorypage.tsx"
|
||||
],
|
||||
"first_hit_rank": null,
|
||||
"hit_at_k": false,
|
||||
"recall_at_k": 0.0,
|
||||
"error": null
|
||||
},
|
||||
"staged:realtime": {
|
||||
"strategy_key": "staged:realtime",
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "realtime",
|
||||
"latency_ms": 13691.39059996605,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\ccw\\src\\commands\\memory.ts",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\handlers.py",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\ui\\commandcombobox.tsx",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\global_graph_expander.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\definition.py",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\orchestrator\\orchestrationplanbuilder.ts",
|
||||
"d:\\claude_dms3\\codex-lens\\build\\lib\\codexlens\\lsp\\handlers.py",
|
||||
"d:\\claude_dms3\\codex-lens\\build\\lib\\codexlens\\search\\global_graph_expander.py",
|
||||
"d:\\claude_dms3\\codex-lens\\build\\lib\\codexlens\\api\\definition.py",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\pages\\memorypage.tsx"
|
||||
],
|
||||
"first_hit_rank": null,
|
||||
"hit_at_k": false,
|
||||
"recall_at_k": 0.0,
|
||||
"error": null
|
||||
},
|
||||
"staged:static_global_graph": {
|
||||
"strategy_key": "staged:static_global_graph",
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "static_global_graph",
|
||||
"latency_ms": 13527.211199998856,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\ccw\\src\\commands\\memory.ts",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\handlers.py",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\ui\\commandcombobox.tsx",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\global_graph_expander.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\definition.py",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\orchestrator\\orchestrationplanbuilder.ts",
|
||||
"d:\\claude_dms3\\codex-lens\\build\\lib\\codexlens\\lsp\\handlers.py",
|
||||
"d:\\claude_dms3\\codex-lens\\build\\lib\\codexlens\\search\\global_graph_expander.py",
|
||||
"d:\\claude_dms3\\codex-lens\\build\\lib\\codexlens\\api\\definition.py",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\pages\\memorypage.tsx"
|
||||
],
|
||||
"first_hit_rank": null,
|
||||
"hit_at_k": false,
|
||||
"recall_at_k": 0.0,
|
||||
"error": null
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -1,415 +0,0 @@
|
||||
{
|
||||
"timestamp": "2026-03-15 00:19:16",
|
||||
"source": "D:\\Claude_dms3",
|
||||
"queries_file": "D:\\Claude_dms3\\codex-lens\\benchmarks\\accuracy_queries_ccw_smart_search.jsonl",
|
||||
"query_count": 1,
|
||||
"k": 10,
|
||||
"coarse_k": 100,
|
||||
"local_only": true,
|
||||
"strategies": {
|
||||
"auto": {
|
||||
"query_count": 1,
|
||||
"hit_at_k": 1.0,
|
||||
"mrr_at_k": 1.0,
|
||||
"avg_recall_at_k": 1.0,
|
||||
"avg_latency_ms": 1377.3565999865532,
|
||||
"p50_latency_ms": 1377.3565999865532,
|
||||
"p95_latency_ms": 1377.3565999865532,
|
||||
"avg_generated_artifact_count": 0.0,
|
||||
"avg_test_file_count": 0.0,
|
||||
"runs_with_generated_artifacts": 0,
|
||||
"runs_with_test_files": 0,
|
||||
"effective_methods": {
|
||||
"fts": 1
|
||||
},
|
||||
"errors": 0,
|
||||
"strategy": "auto",
|
||||
"stage2_mode": null
|
||||
},
|
||||
"fts": {
|
||||
"query_count": 1,
|
||||
"hit_at_k": 1.0,
|
||||
"mrr_at_k": 1.0,
|
||||
"avg_recall_at_k": 1.0,
|
||||
"avg_latency_ms": 1460.0819000601768,
|
||||
"p50_latency_ms": 1460.0819000601768,
|
||||
"p95_latency_ms": 1460.0819000601768,
|
||||
"avg_generated_artifact_count": 0.0,
|
||||
"avg_test_file_count": 0.0,
|
||||
"runs_with_generated_artifacts": 0,
|
||||
"runs_with_test_files": 0,
|
||||
"effective_methods": {
|
||||
"fts": 1
|
||||
},
|
||||
"errors": 0,
|
||||
"strategy": "fts",
|
||||
"stage2_mode": null
|
||||
},
|
||||
"hybrid": {
|
||||
"query_count": 1,
|
||||
"hit_at_k": 0.0,
|
||||
"mrr_at_k": 0.0,
|
||||
"avg_recall_at_k": 0.0,
|
||||
"avg_latency_ms": 45991.74140000343,
|
||||
"p50_latency_ms": 45991.74140000343,
|
||||
"p95_latency_ms": 45991.74140000343,
|
||||
"avg_generated_artifact_count": 0.0,
|
||||
"avg_test_file_count": 0.0,
|
||||
"runs_with_generated_artifacts": 0,
|
||||
"runs_with_test_files": 0,
|
||||
"effective_methods": {
|
||||
"hybrid": 1
|
||||
},
|
||||
"errors": 0,
|
||||
"strategy": "hybrid",
|
||||
"stage2_mode": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"query_count": 1,
|
||||
"hit_at_k": 0.0,
|
||||
"mrr_at_k": 0.0,
|
||||
"avg_recall_at_k": 0.0,
|
||||
"avg_latency_ms": 22739.62610000372,
|
||||
"p50_latency_ms": 22739.62610000372,
|
||||
"p95_latency_ms": 22739.62610000372,
|
||||
"avg_generated_artifact_count": 1.0,
|
||||
"avg_test_file_count": 2.0,
|
||||
"runs_with_generated_artifacts": 1,
|
||||
"runs_with_test_files": 1,
|
||||
"effective_methods": {
|
||||
"dense_rerank": 1
|
||||
},
|
||||
"errors": 0,
|
||||
"strategy": "dense_rerank",
|
||||
"stage2_mode": null
|
||||
},
|
||||
"staged:precomputed": {
|
||||
"query_count": 1,
|
||||
"hit_at_k": 0.0,
|
||||
"mrr_at_k": 0.0,
|
||||
"avg_recall_at_k": 0.0,
|
||||
"avg_latency_ms": 14900.017599999905,
|
||||
"p50_latency_ms": 14900.017599999905,
|
||||
"p95_latency_ms": 14900.017599999905,
|
||||
"avg_generated_artifact_count": 1.0,
|
||||
"avg_test_file_count": 0.0,
|
||||
"runs_with_generated_artifacts": 1,
|
||||
"runs_with_test_files": 0,
|
||||
"effective_methods": {
|
||||
"staged": 1
|
||||
},
|
||||
"errors": 0,
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "precomputed"
|
||||
},
|
||||
"staged:realtime": {
|
||||
"query_count": 1,
|
||||
"hit_at_k": 0.0,
|
||||
"mrr_at_k": 0.0,
|
||||
"avg_recall_at_k": 0.0,
|
||||
"avg_latency_ms": 14104.314599990845,
|
||||
"p50_latency_ms": 14104.314599990845,
|
||||
"p95_latency_ms": 14104.314599990845,
|
||||
"avg_generated_artifact_count": 1.0,
|
||||
"avg_test_file_count": 0.0,
|
||||
"runs_with_generated_artifacts": 1,
|
||||
"runs_with_test_files": 0,
|
||||
"effective_methods": {
|
||||
"staged": 1
|
||||
},
|
||||
"errors": 0,
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "realtime"
|
||||
},
|
||||
"staged:static_global_graph": {
|
||||
"query_count": 1,
|
||||
"hit_at_k": 0.0,
|
||||
"mrr_at_k": 0.0,
|
||||
"avg_recall_at_k": 0.0,
|
||||
"avg_latency_ms": 11906.852500021458,
|
||||
"p50_latency_ms": 11906.852500021458,
|
||||
"p95_latency_ms": 11906.852500021458,
|
||||
"avg_generated_artifact_count": 1.0,
|
||||
"avg_test_file_count": 0.0,
|
||||
"runs_with_generated_artifacts": 1,
|
||||
"runs_with_test_files": 0,
|
||||
"effective_methods": {
|
||||
"staged": 1
|
||||
},
|
||||
"errors": 0,
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "static_global_graph"
|
||||
}
|
||||
},
|
||||
"stage2_mode_matrix": {
|
||||
"precomputed": {
|
||||
"query_count": 1,
|
||||
"hit_at_k": 0.0,
|
||||
"mrr_at_k": 0.0,
|
||||
"avg_recall_at_k": 0.0,
|
||||
"avg_latency_ms": 14900.017599999905,
|
||||
"p50_latency_ms": 14900.017599999905,
|
||||
"p95_latency_ms": 14900.017599999905,
|
||||
"avg_generated_artifact_count": 1.0,
|
||||
"avg_test_file_count": 0.0,
|
||||
"runs_with_generated_artifacts": 1,
|
||||
"runs_with_test_files": 0,
|
||||
"effective_methods": {
|
||||
"staged": 1
|
||||
},
|
||||
"errors": 0,
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "precomputed"
|
||||
},
|
||||
"realtime": {
|
||||
"query_count": 1,
|
||||
"hit_at_k": 0.0,
|
||||
"mrr_at_k": 0.0,
|
||||
"avg_recall_at_k": 0.0,
|
||||
"avg_latency_ms": 14104.314599990845,
|
||||
"p50_latency_ms": 14104.314599990845,
|
||||
"p95_latency_ms": 14104.314599990845,
|
||||
"avg_generated_artifact_count": 1.0,
|
||||
"avg_test_file_count": 0.0,
|
||||
"runs_with_generated_artifacts": 1,
|
||||
"runs_with_test_files": 0,
|
||||
"effective_methods": {
|
||||
"staged": 1
|
||||
},
|
||||
"errors": 0,
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "realtime"
|
||||
},
|
||||
"static_global_graph": {
|
||||
"query_count": 1,
|
||||
"hit_at_k": 0.0,
|
||||
"mrr_at_k": 0.0,
|
||||
"avg_recall_at_k": 0.0,
|
||||
"avg_latency_ms": 11906.852500021458,
|
||||
"p50_latency_ms": 11906.852500021458,
|
||||
"p95_latency_ms": 11906.852500021458,
|
||||
"avg_generated_artifact_count": 1.0,
|
||||
"avg_test_file_count": 0.0,
|
||||
"runs_with_generated_artifacts": 1,
|
||||
"runs_with_test_files": 0,
|
||||
"effective_methods": {
|
||||
"staged": 1
|
||||
},
|
||||
"errors": 0,
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "static_global_graph"
|
||||
}
|
||||
},
|
||||
"pairwise_stage2_deltas": [
|
||||
{
|
||||
"mode_a": "precomputed",
|
||||
"mode_b": "realtime",
|
||||
"hit_at_k_delta": 0.0,
|
||||
"mrr_at_k_delta": 0.0,
|
||||
"avg_recall_at_k_delta": 0.0,
|
||||
"avg_latency_ms_delta": 795.7030000090599
|
||||
},
|
||||
{
|
||||
"mode_a": "precomputed",
|
||||
"mode_b": "static_global_graph",
|
||||
"hit_at_k_delta": 0.0,
|
||||
"mrr_at_k_delta": 0.0,
|
||||
"avg_recall_at_k_delta": 0.0,
|
||||
"avg_latency_ms_delta": 2993.165099978447
|
||||
},
|
||||
{
|
||||
"mode_a": "realtime",
|
||||
"mode_b": "static_global_graph",
|
||||
"hit_at_k_delta": 0.0,
|
||||
"mrr_at_k_delta": 0.0,
|
||||
"avg_recall_at_k_delta": 0.0,
|
||||
"avg_latency_ms_delta": 2197.462099969387
|
||||
}
|
||||
],
|
||||
"config": {
|
||||
"embedding_backend": "fastembed",
|
||||
"embedding_model": "code",
|
||||
"embedding_use_gpu": false,
|
||||
"reranker_backend": "onnx",
|
||||
"reranker_model": "cross-encoder/ms-marco-MiniLM-L-6-v2",
|
||||
"reranker_use_gpu": false,
|
||||
"enable_staged_rerank": true,
|
||||
"enable_cross_encoder_rerank": true
|
||||
},
|
||||
"evaluations": [
|
||||
{
|
||||
"query": "executeHybridMode dense_rerank semantic smart_search",
|
||||
"intent": "ccw-semantic-routing",
|
||||
"notes": "CCW semantic mode delegates to CodexLens dense_rerank.",
|
||||
"relevant_paths": [
|
||||
"D:\\Claude_dms3\\ccw\\src\\tools\\smart-search.ts"
|
||||
],
|
||||
"runs": {
|
||||
"auto": {
|
||||
"strategy_key": "auto",
|
||||
"strategy": "auto",
|
||||
"stage2_mode": null,
|
||||
"effective_method": "fts",
|
||||
"execution_method": "fts",
|
||||
"latency_ms": 1377.3565999865532,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\ccw\\src\\tools\\smart-search.ts"
|
||||
],
|
||||
"first_hit_rank": 1,
|
||||
"hit_at_k": true,
|
||||
"recall_at_k": 1.0,
|
||||
"generated_artifact_count": 0,
|
||||
"test_file_count": 0,
|
||||
"error": null
|
||||
},
|
||||
"fts": {
|
||||
"strategy_key": "fts",
|
||||
"strategy": "fts",
|
||||
"stage2_mode": null,
|
||||
"effective_method": "fts",
|
||||
"execution_method": "fts",
|
||||
"latency_ms": 1460.0819000601768,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\ccw\\src\\tools\\smart-search.ts"
|
||||
],
|
||||
"first_hit_rank": 1,
|
||||
"hit_at_k": true,
|
||||
"recall_at_k": 1.0,
|
||||
"generated_artifact_count": 0,
|
||||
"test_file_count": 0,
|
||||
"error": null
|
||||
},
|
||||
"hybrid": {
|
||||
"strategy_key": "hybrid",
|
||||
"strategy": "hybrid",
|
||||
"stage2_mode": null,
|
||||
"effective_method": "hybrid",
|
||||
"execution_method": "hybrid",
|
||||
"latency_ms": 45991.74140000343,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\ccw\\src\\config\\litellm-api-config-manager.ts",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\api_reranker.py",
|
||||
"d:\\claude_dms3\\ccw\\src\\commands\\core-memory.ts",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\scripts\\generate_embeddings.py",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\routes\\notification-routes.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\tools\\team-msg.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\types\\remote-notification.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\memory-store.ts",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py"
|
||||
],
|
||||
"first_hit_rank": null,
|
||||
"hit_at_k": false,
|
||||
"recall_at_k": 0.0,
|
||||
"generated_artifact_count": 0,
|
||||
"test_file_count": 0,
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy_key": "dense_rerank",
|
||||
"strategy": "dense_rerank",
|
||||
"stage2_mode": null,
|
||||
"effective_method": "dense_rerank",
|
||||
"execution_method": "cascade",
|
||||
"latency_ms": 22739.62610000372,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\routes\\issue-routes.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\tools\\session-manager.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\types\\queue-types.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\nativesessionpanel.tsx",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\history-importer.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\memory-extraction-pipeline.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\tests\\e2e\\skills-page.spec.ts",
|
||||
"d:\\claude_dms3\\ccw\\dist\\tools\\discover-design-files.js",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\api-settings\\clisettingsmodal.tsx",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\tests\\e2e\\api-settings.spec.ts"
|
||||
],
|
||||
"first_hit_rank": null,
|
||||
"hit_at_k": false,
|
||||
"recall_at_k": 0.0,
|
||||
"generated_artifact_count": 1,
|
||||
"test_file_count": 2,
|
||||
"error": null
|
||||
},
|
||||
"staged:precomputed": {
|
||||
"strategy_key": "staged:precomputed",
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "precomputed",
|
||||
"effective_method": "staged",
|
||||
"execution_method": "cascade",
|
||||
"latency_ms": 14900.017599999905,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\ccw\\src\\tools\\native-session-discovery.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\commands\\memory.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\useissues.test.tsx",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\routes\\cli-sessions-routes.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\lib\\api.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\filepreview.tsx",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\hooks\\hook-templates.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\utils\\file-reader.ts",
|
||||
"d:\\claude_dms3\\ccw\\dist\\core\\routes\\cli-sessions-routes.js",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\history-importer.ts"
|
||||
],
|
||||
"first_hit_rank": null,
|
||||
"hit_at_k": false,
|
||||
"recall_at_k": 0.0,
|
||||
"generated_artifact_count": 1,
|
||||
"test_file_count": 0,
|
||||
"error": null
|
||||
},
|
||||
"staged:realtime": {
|
||||
"strategy_key": "staged:realtime",
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "realtime",
|
||||
"effective_method": "staged",
|
||||
"execution_method": "cascade",
|
||||
"latency_ms": 14104.314599990845,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\ccw\\src\\tools\\native-session-discovery.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\commands\\memory.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\useissues.test.tsx",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\routes\\cli-sessions-routes.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\lib\\api.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\filepreview.tsx",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\hooks\\hook-templates.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\utils\\file-reader.ts",
|
||||
"d:\\claude_dms3\\ccw\\dist\\core\\routes\\cli-sessions-routes.js",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\history-importer.ts"
|
||||
],
|
||||
"first_hit_rank": null,
|
||||
"hit_at_k": false,
|
||||
"recall_at_k": 0.0,
|
||||
"generated_artifact_count": 1,
|
||||
"test_file_count": 0,
|
||||
"error": null
|
||||
},
|
||||
"staged:static_global_graph": {
|
||||
"strategy_key": "staged:static_global_graph",
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "static_global_graph",
|
||||
"effective_method": "staged",
|
||||
"execution_method": "cascade",
|
||||
"latency_ms": 11906.852500021458,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\ccw\\src\\tools\\native-session-discovery.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\commands\\memory.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\useissues.test.tsx",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\routes\\cli-sessions-routes.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\lib\\api.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\filepreview.tsx",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\hooks\\hook-templates.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\utils\\file-reader.ts",
|
||||
"d:\\claude_dms3\\ccw\\dist\\core\\routes\\cli-sessions-routes.js",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\history-importer.ts"
|
||||
],
|
||||
"first_hit_rank": null,
|
||||
"hit_at_k": false,
|
||||
"recall_at_k": 0.0,
|
||||
"generated_artifact_count": 1,
|
||||
"test_file_count": 0,
|
||||
"error": null
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -1,453 +0,0 @@
|
||||
{
|
||||
"summary": {
|
||||
"timestamp": "2026-02-09 11:08:47",
|
||||
"source": "src",
|
||||
"k": 10,
|
||||
"coarse_k": 100,
|
||||
"query_count": 7,
|
||||
"avg_jaccard_topk": 0.41421235160730957,
|
||||
"avg_rbo_topk": 0.22899068093857142,
|
||||
"staged": {
|
||||
"success": 7,
|
||||
"avg_latency_ms": 32009.68328570468
|
||||
},
|
||||
"dense_rerank": {
|
||||
"success": 7,
|
||||
"avg_latency_ms": 2783.3305999977247
|
||||
}
|
||||
},
|
||||
"comparisons": [
|
||||
{
|
||||
"query": "class Config",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "class Config",
|
||||
"latency_ms": 40875.45489999652,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\file_watcher.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\__init__.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 10633.91399383545,
|
||||
"stage2_expand_ms": 12487.980365753174,
|
||||
"stage3_cluster_ms": 10781.587362289429,
|
||||
"stage4_rerank_ms": 6914.837837219238
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 100,
|
||||
"stage2_expanded": 149,
|
||||
"stage3_clustered": 20,
|
||||
"stage4_reranked": 20
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "class Config",
|
||||
"latency_ms": 3111.874899983406,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.1111111111111111,
|
||||
"rbo_topk": 0.06741929885142856,
|
||||
"staged_unique_files_topk": 10,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 8,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "def search",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "def search",
|
||||
"latency_ms": 38541.18510001898,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 548.8920211791992,
|
||||
"stage2_expand_ms": 27176.724433898926,
|
||||
"stage3_cluster_ms": 8352.917671203613,
|
||||
"stage4_rerank_ms": 2392.6541805267334
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 100,
|
||||
"stage2_expanded": 101,
|
||||
"stage3_clustered": 20,
|
||||
"stage4_reranked": 20
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "def search",
|
||||
"latency_ms": 2652.75,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.26666666666666666,
|
||||
"rbo_topk": 0.2983708721671428,
|
||||
"staged_unique_files_topk": 9,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 4,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "LspBridge",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "LspBridge",
|
||||
"latency_ms": 26319.983999997377,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\merkle_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 514.4834518432617,
|
||||
"stage2_expand_ms": 14329.241514205933,
|
||||
"stage3_cluster_ms": 9249.040842056274,
|
||||
"stage4_rerank_ms": 2159.9059104919434
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 100,
|
||||
"stage2_expanded": 100,
|
||||
"stage3_clustered": 20,
|
||||
"stage4_reranked": 20
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "LspBridge",
|
||||
"latency_ms": 2666.9745999872684,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.6666666666666666,
|
||||
"rbo_topk": 0.3571430355128571,
|
||||
"staged_unique_files_topk": 10,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 4,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "graph expansion",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "graph expansion",
|
||||
"latency_ms": 25696.087299972773,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 560.4684352874756,
|
||||
"stage2_expand_ms": 13951.441526412964,
|
||||
"stage3_cluster_ms": 8879.387140274048,
|
||||
"stage4_rerank_ms": 2229.4514179229736
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 100,
|
||||
"stage2_expanded": 100,
|
||||
"stage3_clustered": 20,
|
||||
"stage4_reranked": 20
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "graph expansion",
|
||||
"latency_ms": 2544.8630999922752,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.42857142857142855,
|
||||
"rbo_topk": 0.13728894791142857,
|
||||
"staged_unique_files_topk": 10,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 4,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "clustering strategy",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "clustering strategy",
|
||||
"latency_ms": 27387.41929998994,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 625.0262260437012,
|
||||
"stage2_expand_ms": 14211.347103118896,
|
||||
"stage3_cluster_ms": 10269.58680152893,
|
||||
"stage4_rerank_ms": 2208.007335662842
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 100,
|
||||
"stage2_expanded": 100,
|
||||
"stage3_clustered": 20,
|
||||
"stage4_reranked": 20
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "clustering strategy",
|
||||
"latency_ms": 2928.22389999032,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.17647058823529413,
|
||||
"rbo_topk": 0.07116480920571429,
|
||||
"staged_unique_files_topk": 10,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 4,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "error handling",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "error handling",
|
||||
"latency_ms": 23732.33979997039,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 504.0884017944336,
|
||||
"stage2_expand_ms": 12899.415016174316,
|
||||
"stage3_cluster_ms": 7881.027936935425,
|
||||
"stage4_rerank_ms": 2372.1535205841064
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 100,
|
||||
"stage2_expanded": 100,
|
||||
"stage3_clustered": 20,
|
||||
"stage4_reranked": 20
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "error handling",
|
||||
"latency_ms": 2946.439900010824,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.6666666666666666,
|
||||
"rbo_topk": 0.19158624676285715,
|
||||
"staged_unique_files_topk": 10,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 4,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "how to parse json",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "how to parse json",
|
||||
"latency_ms": 41515.31259998679,
|
||||
"num_results": 9,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 601.7005443572998,
|
||||
"stage2_expand_ms": 30052.319765090942,
|
||||
"stage3_cluster_ms": 8409.791231155396,
|
||||
"stage4_rerank_ms": 2371.1729049682617
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 100,
|
||||
"stage2_expanded": 100,
|
||||
"stage3_clustered": 20,
|
||||
"stage4_reranked": 20
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "how to parse json",
|
||||
"latency_ms": 2632.1878000199795,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.5833333333333334,
|
||||
"rbo_topk": 0.4799615561585714,
|
||||
"staged_unique_files_topk": 9,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 4,
|
||||
"dense_unique_dirs_topk": 4
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -1,356 +0,0 @@
|
||||
{
|
||||
"summary": {
|
||||
"timestamp": "2026-02-09 20:37:28",
|
||||
"source": "src",
|
||||
"k": 10,
|
||||
"coarse_k": 100,
|
||||
"query_count": 7,
|
||||
"avg_jaccard_topk": 0.12095811211246858,
|
||||
"avg_rbo_topk": 0.09594444061244897,
|
||||
"staged": {
|
||||
"success": 7,
|
||||
"avg_latency_ms": 2471.239057132176
|
||||
},
|
||||
"dense_rerank": {
|
||||
"success": 7,
|
||||
"avg_latency_ms": 3087.217985710927
|
||||
}
|
||||
},
|
||||
"comparisons": [
|
||||
{
|
||||
"query": "class Config",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "class Config",
|
||||
"latency_ms": 312.2674999535084,
|
||||
"num_results": 37,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\litellm_reranker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "class Config",
|
||||
"latency_ms": 2672.6916999816895,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.05263157894736842,
|
||||
"rbo_topk": 0.045191399425714276,
|
||||
"staged_unique_files_topk": 10,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 5,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "def search",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "def search",
|
||||
"latency_ms": 15344.861499994993,
|
||||
"num_results": 3,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\entities.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 81.70747756958008,
|
||||
"stage2_expand_ms": 12762.907266616821,
|
||||
"stage3_cluster_ms": 0.0021457672119140625,
|
||||
"stage4_rerank_ms": 2422.7287769317627
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 3,
|
||||
"stage2_expanded": 4,
|
||||
"stage2_unique_paths": 3,
|
||||
"stage2_duplicate_paths": 1,
|
||||
"stage3_clustered": 4,
|
||||
"stage3_strategy": "dir_rr",
|
||||
"stage4_reranked": 4
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "def search",
|
||||
"latency_ms": 2908.5530000030994,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.09090909090909091,
|
||||
"rbo_topk": 0.23541639942571424,
|
||||
"staged_unique_files_topk": 2,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 2,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "LspBridge",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "LspBridge",
|
||||
"latency_ms": 328.4989999830723,
|
||||
"num_results": 5,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "LspBridge",
|
||||
"latency_ms": 3426.8526000082493,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.07142857142857142,
|
||||
"rbo_topk": 0.045191399425714276,
|
||||
"staged_unique_files_topk": 5,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 2,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "graph expansion",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "graph expansion",
|
||||
"latency_ms": 359.32230001688004,
|
||||
"num_results": 11,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_007_add_graph_neighbors.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\hybrid_search\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "graph expansion",
|
||||
"latency_ms": 3472.025099992752,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.17647058823529413,
|
||||
"rbo_topk": 0.06801300374142856,
|
||||
"staged_unique_files_topk": 10,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 7,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "clustering strategy",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "clustering strategy",
|
||||
"latency_ms": 289.3139999806881,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\factory.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\noop_strategy.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\hdbscan_strategy.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\dbscan_strategy.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\base.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\frequency_strategy.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "clustering strategy",
|
||||
"latency_ms": 2859.5299999713898,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.1111111111111111,
|
||||
"rbo_topk": 0.04670528456571428,
|
||||
"staged_unique_files_topk": 10,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 3,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "error handling",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "error handling",
|
||||
"latency_ms": 305.66699999570847,
|
||||
"num_results": 5,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\rotational_embedder.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "error handling",
|
||||
"latency_ms": 3101.3711999952793,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.07142857142857142,
|
||||
"rbo_topk": 0.045191399425714276,
|
||||
"staged_unique_files_topk": 5,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 4,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "how to parse json",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "how to parse json",
|
||||
"latency_ms": 358.74210000038147,
|
||||
"num_results": 4,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "how to parse json",
|
||||
"latency_ms": 3169.5023000240326,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.2727272727272727,
|
||||
"rbo_topk": 0.18590219827714285,
|
||||
"staged_unique_files_topk": 4,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 3,
|
||||
"dense_unique_dirs_topk": 4
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -1,466 +0,0 @@
|
||||
{
|
||||
"summary": {
|
||||
"timestamp": "2026-02-09 20:48:55",
|
||||
"source": "src",
|
||||
"k": 10,
|
||||
"coarse_k": 100,
|
||||
"query_count": 7,
|
||||
"avg_jaccard_topk": 0.11418494830148965,
|
||||
"avg_rbo_topk": 0.08910725003591835,
|
||||
"staged": {
|
||||
"success": 7,
|
||||
"avg_latency_ms": 16443.109000005894
|
||||
},
|
||||
"dense_rerank": {
|
||||
"success": 7,
|
||||
"avg_latency_ms": 2919.481471432107
|
||||
}
|
||||
},
|
||||
"comparisons": [
|
||||
{
|
||||
"query": "class Config",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "class Config",
|
||||
"latency_ms": 6056.956700026989,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\file_watcher.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 113.12270164489746,
|
||||
"stage1_fallback_search_ms": 262.55249977111816,
|
||||
"stage2_expand_ms": 3022.8426456451416,
|
||||
"stage3_cluster_ms": 1.155853271484375,
|
||||
"stage4_rerank_ms": 2554.953098297119
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 37,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 86,
|
||||
"stage2_unique_paths": 53,
|
||||
"stage2_duplicate_paths": 33,
|
||||
"stage3_clustered": 20,
|
||||
"stage3_strategy": "dir_rr",
|
||||
"stage4_reranked": 20
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "class Config",
|
||||
"latency_ms": 2788.0383999943733,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.05263157894736842,
|
||||
"rbo_topk": 0.014635885139999999,
|
||||
"staged_unique_files_topk": 10,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 8,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "def search",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "def search",
|
||||
"latency_ms": 12229.477500021458,
|
||||
"num_results": 3,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\entities.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 108.82282257080078,
|
||||
"stage2_expand_ms": 9422.304153442383,
|
||||
"stage3_cluster_ms": 0.001430511474609375,
|
||||
"stage4_rerank_ms": 2611.234664916992
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 3,
|
||||
"stage2_expanded": 4,
|
||||
"stage2_unique_paths": 3,
|
||||
"stage2_duplicate_paths": 1,
|
||||
"stage3_clustered": 4,
|
||||
"stage3_strategy": "dir_rr",
|
||||
"stage4_reranked": 4
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "def search",
|
||||
"latency_ms": 2823.377499997616,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.09090909090909091,
|
||||
"rbo_topk": 0.23541639942571424,
|
||||
"staged_unique_files_topk": 2,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 2,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "LspBridge",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "LspBridge",
|
||||
"latency_ms": 33805.434699982405,
|
||||
"num_results": 5,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 100.5556583404541,
|
||||
"stage1_fallback_search_ms": 176.71489715576172,
|
||||
"stage2_expand_ms": 31017.661809921265,
|
||||
"stage3_cluster_ms": 0.001430511474609375,
|
||||
"stage4_rerank_ms": 2403.3148288726807
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 5,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 5,
|
||||
"stage2_unique_paths": 5,
|
||||
"stage2_duplicate_paths": 0,
|
||||
"stage3_clustered": 5,
|
||||
"stage3_strategy": "dir_rr",
|
||||
"stage4_reranked": 5
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "LspBridge",
|
||||
"latency_ms": 2906.127400010824,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.07142857142857142,
|
||||
"rbo_topk": 0.045191399425714276,
|
||||
"staged_unique_files_topk": 5,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 2,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "graph expansion",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "graph expansion",
|
||||
"latency_ms": 16790.213800013065,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_007_add_graph_neighbors.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\hybrid_search\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 110.00967025756836,
|
||||
"stage1_fallback_search_ms": 176.9556999206543,
|
||||
"stage2_expand_ms": 13929.782629013062,
|
||||
"stage3_cluster_ms": 0.45800209045410156,
|
||||
"stage4_rerank_ms": 2486.6883754730225
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 11,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 29,
|
||||
"stage2_unique_paths": 14,
|
||||
"stage2_duplicate_paths": 15,
|
||||
"stage3_clustered": 20,
|
||||
"stage3_strategy": "dir_rr",
|
||||
"stage4_reranked": 20
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "graph expansion",
|
||||
"latency_ms": 2866.819000005722,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.1875,
|
||||
"rbo_topk": 0.06893318399142857,
|
||||
"staged_unique_files_topk": 9,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 8,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "clustering strategy",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "clustering strategy",
|
||||
"latency_ms": 9090.759900003672,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\dbscan_strategy.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\noop_strategy.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\base.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\hdbscan_strategy.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\frequency_strategy.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\factory.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\__init__.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 85.28780937194824,
|
||||
"stage1_fallback_search_ms": 183.7012767791748,
|
||||
"stage2_expand_ms": 5557.527780532837,
|
||||
"stage3_cluster_ms": 0.001430511474609375,
|
||||
"stage4_rerank_ms": 3164.6268367767334
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 10,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 10,
|
||||
"stage2_unique_paths": 10,
|
||||
"stage2_duplicate_paths": 0,
|
||||
"stage3_clustered": 10,
|
||||
"stage3_strategy": "dir_rr",
|
||||
"stage4_reranked": 10
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "clustering strategy",
|
||||
"latency_ms": 3062.4616000056267,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.1111111111111111,
|
||||
"rbo_topk": 0.04670528456571428,
|
||||
"staged_unique_files_topk": 10,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 3,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "error handling",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "error handling",
|
||||
"latency_ms": 19777.87659996748,
|
||||
"num_results": 6,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\rotational_embedder.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 65.9482479095459,
|
||||
"stage1_fallback_search_ms": 181.9770336151123,
|
||||
"stage2_expand_ms": 16960.813760757446,
|
||||
"stage3_cluster_ms": 0.00095367431640625,
|
||||
"stage4_rerank_ms": 2472.1477031707764
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 5,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 13,
|
||||
"stage2_unique_paths": 6,
|
||||
"stage2_duplicate_paths": 7,
|
||||
"stage3_clustered": 13,
|
||||
"stage3_strategy": "dir_rr",
|
||||
"stage4_reranked": 13
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "error handling",
|
||||
"latency_ms": 2854.169200003147,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.07142857142857142,
|
||||
"rbo_topk": 0.045191399425714276,
|
||||
"staged_unique_files_topk": 5,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 4,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "how to parse json",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "how to parse json",
|
||||
"latency_ms": 17351.04380002618,
|
||||
"num_results": 7,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\indexing\\symbol_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\treesitter_parser.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 119.1408634185791,
|
||||
"stage1_fallback_search_ms": 246.2625503540039,
|
||||
"stage2_expand_ms": 14137.234449386597,
|
||||
"stage3_cluster_ms": 0.0011920928955078125,
|
||||
"stage4_rerank_ms": 2750.417470932007
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 4,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 11,
|
||||
"stage2_unique_paths": 7,
|
||||
"stage2_duplicate_paths": 4,
|
||||
"stage3_clustered": 11,
|
||||
"stage3_strategy": "dir_rr",
|
||||
"stage4_reranked": 11
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "how to parse json",
|
||||
"latency_ms": 3135.3772000074387,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.21428571428571427,
|
||||
"rbo_topk": 0.16767719827714284,
|
||||
"staged_unique_files_topk": 7,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 5,
|
||||
"dense_unique_dirs_topk": 4
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -1,467 +0,0 @@
|
||||
{
|
||||
"summary": {
|
||||
"timestamp": "2026-02-09 20:56:02",
|
||||
"source": "src",
|
||||
"k": 10,
|
||||
"coarse_k": 100,
|
||||
"query_count": 7,
|
||||
"avg_jaccard_topk": 0.11350467619264612,
|
||||
"avg_rbo_topk": 0.09062624799510204,
|
||||
"staged": {
|
||||
"success": 7,
|
||||
"avg_latency_ms": 8679.35167142323
|
||||
},
|
||||
"dense_rerank": {
|
||||
"success": 7,
|
||||
"avg_latency_ms": 3097.294714289052
|
||||
}
|
||||
},
|
||||
"comparisons": [
|
||||
{
|
||||
"query": "class Config",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "class Config",
|
||||
"latency_ms": 6814.465099990368,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\file_watcher.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 85.55030822753906,
|
||||
"stage1_fallback_search_ms": 197.95989990234375,
|
||||
"stage2_expand_ms": 3032.4549674987793,
|
||||
"stage3_cluster_ms": 1.1937618255615234,
|
||||
"stage4_rerank_ms": 3402.9476642608643
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 37,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 86,
|
||||
"stage2_unique_paths": 53,
|
||||
"stage2_duplicate_paths": 33,
|
||||
"stage3_clustered": 20,
|
||||
"stage3_strategy": "dir_rr",
|
||||
"stage4_reranked": 20
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "class Config",
|
||||
"latency_ms": 3175.0339000225067,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.05263157894736842,
|
||||
"rbo_topk": 0.014635885139999999,
|
||||
"staged_unique_files_topk": 10,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 8,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "def search",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "def search",
|
||||
"latency_ms": 8990.238099992275,
|
||||
"num_results": 3,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\entities.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 90.6367301940918,
|
||||
"stage2_expand_ms": 6272.260665893555,
|
||||
"stage3_cluster_ms": 0.00095367431640625,
|
||||
"stage4_rerank_ms": 2531.4290523529053
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 3,
|
||||
"stage2_expanded": 4,
|
||||
"stage2_unique_paths": 3,
|
||||
"stage2_duplicate_paths": 1,
|
||||
"stage3_clustered": 4,
|
||||
"stage3_strategy": "dir_rr",
|
||||
"stage4_reranked": 4
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "def search",
|
||||
"latency_ms": 3434.4095999896526,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.09090909090909091,
|
||||
"rbo_topk": 0.23541639942571424,
|
||||
"staged_unique_files_topk": 2,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 2,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "LspBridge",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "LspBridge",
|
||||
"latency_ms": 9296.205000013113,
|
||||
"num_results": 7,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\keepalive_bridge.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\__init__.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 86.64774894714355,
|
||||
"stage1_fallback_search_ms": 163.8650894165039,
|
||||
"stage2_expand_ms": 6144.1497802734375,
|
||||
"stage3_cluster_ms": 0.4100799560546875,
|
||||
"stage4_rerank_ms": 2807.274580001831
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 5,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 31,
|
||||
"stage2_unique_paths": 11,
|
||||
"stage2_duplicate_paths": 20,
|
||||
"stage3_clustered": 20,
|
||||
"stage3_strategy": "dir_rr",
|
||||
"stage4_reranked": 20
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "LspBridge",
|
||||
"latency_ms": 3043.4417999982834,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.06666666666666667,
|
||||
"rbo_topk": 0.045191399425714276,
|
||||
"staged_unique_files_topk": 6,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 2,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "graph expansion",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "graph expansion",
|
||||
"latency_ms": 9086.15110000968,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_007_add_graph_neighbors.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 72.22437858581543,
|
||||
"stage1_fallback_search_ms": 166.3804054260254,
|
||||
"stage2_expand_ms": 6179.303169250488,
|
||||
"stage3_cluster_ms": 0.00095367431640625,
|
||||
"stage4_rerank_ms": 2575.9027004241943
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 11,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 16,
|
||||
"stage2_unique_paths": 13,
|
||||
"stage2_duplicate_paths": 3,
|
||||
"stage3_clustered": 16,
|
||||
"stage3_strategy": "dir_rr",
|
||||
"stage4_reranked": 16
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "graph expansion",
|
||||
"latency_ms": 2793.8257000148296,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.1875,
|
||||
"rbo_topk": 0.06134116970571428,
|
||||
"staged_unique_files_topk": 9,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 7,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "clustering strategy",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "clustering strategy",
|
||||
"latency_ms": 8401.927499979734,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\noop_strategy.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\dbscan_strategy.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\base.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\hdbscan_strategy.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\frequency_strategy.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\factory.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\__init__.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 72.67880439758301,
|
||||
"stage1_fallback_search_ms": 166.71442985534668,
|
||||
"stage2_expand_ms": 5561.89489364624,
|
||||
"stage3_cluster_ms": 0.0007152557373046875,
|
||||
"stage4_rerank_ms": 2517.7178382873535
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 10,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 10,
|
||||
"stage2_unique_paths": 10,
|
||||
"stage2_duplicate_paths": 0,
|
||||
"stage3_clustered": 10,
|
||||
"stage3_strategy": "dir_rr",
|
||||
"stage4_reranked": 10
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "clustering strategy",
|
||||
"latency_ms": 3192.0045999884605,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.1111111111111111,
|
||||
"rbo_topk": 0.04670528456571428,
|
||||
"staged_unique_files_topk": 10,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 3,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "error handling",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "error handling",
|
||||
"latency_ms": 9032.269400000572,
|
||||
"num_results": 6,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\rotational_embedder.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 78.59635353088379,
|
||||
"stage1_fallback_search_ms": 180.96280097961426,
|
||||
"stage2_expand_ms": 6175.840377807617,
|
||||
"stage3_cluster_ms": 0.001430511474609375,
|
||||
"stage4_rerank_ms": 2503.4260749816895
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 5,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 13,
|
||||
"stage2_unique_paths": 6,
|
||||
"stage2_duplicate_paths": 7,
|
||||
"stage3_clustered": 13,
|
||||
"stage3_strategy": "dir_rr",
|
||||
"stage4_reranked": 13
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "error handling",
|
||||
"latency_ms": 3076.744800001383,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.07142857142857142,
|
||||
"rbo_topk": 0.045191399425714276,
|
||||
"staged_unique_files_topk": 5,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 4,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "how to parse json",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "how to parse json",
|
||||
"latency_ms": 9134.205499976873,
|
||||
"num_results": 7,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\indexing\\symbol_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\treesitter_parser.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 117.79379844665527,
|
||||
"stage1_fallback_search_ms": 187.53886222839355,
|
||||
"stage2_expand_ms": 6218.849658966064,
|
||||
"stage3_cluster_ms": 0.00095367431640625,
|
||||
"stage4_rerank_ms": 2515.6633853912354
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 4,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 9,
|
||||
"stage2_unique_paths": 7,
|
||||
"stage2_duplicate_paths": 2,
|
||||
"stage3_clustered": 9,
|
||||
"stage3_strategy": "dir_rr",
|
||||
"stage4_reranked": 9
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "how to parse json",
|
||||
"latency_ms": 2965.6026000082493,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.21428571428571427,
|
||||
"rbo_topk": 0.18590219827714285,
|
||||
"staged_unique_files_topk": 7,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 5,
|
||||
"dense_unique_dirs_topk": 4
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -1,171 +0,0 @@
|
||||
{
|
||||
"summary": {
|
||||
"timestamp": "2026-02-09 19:16:45",
|
||||
"source": "src",
|
||||
"k": 10,
|
||||
"coarse_k": 100,
|
||||
"query_count": 3,
|
||||
"avg_jaccard_topk": 0.07165641376167692,
|
||||
"avg_rbo_topk": 0.10859973275904759,
|
||||
"staged": {
|
||||
"success": 3,
|
||||
"avg_latency_ms": 7919.317766676347
|
||||
},
|
||||
"dense_rerank": {
|
||||
"success": 3,
|
||||
"avg_latency_ms": 2812.574933330218
|
||||
}
|
||||
},
|
||||
"comparisons": [
|
||||
{
|
||||
"query": "class Config",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "class Config",
|
||||
"latency_ms": 6351.961700022221,
|
||||
"num_results": 37,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\litellm_reranker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "class Config",
|
||||
"latency_ms": 4424.698300004005,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.05263157894736842,
|
||||
"rbo_topk": 0.045191399425714276,
|
||||
"staged_unique_files_topk": 10,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 5,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "def search",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "def search",
|
||||
"latency_ms": 17239.81479999423,
|
||||
"num_results": 3,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\entities.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 18.40996742248535,
|
||||
"stage2_expand_ms": 16024.681329727173,
|
||||
"stage3_cluster_ms": 0.00095367431640625,
|
||||
"stage4_rerank_ms": 1160.1319313049316
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 3,
|
||||
"stage2_expanded": 4,
|
||||
"stage2_unique_paths": 3,
|
||||
"stage2_duplicate_paths": 1,
|
||||
"stage3_clustered": 4,
|
||||
"stage3_strategy": "score",
|
||||
"stage4_reranked": 4
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "def search",
|
||||
"latency_ms": 2086.8772999942303,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.09090909090909091,
|
||||
"rbo_topk": 0.23541639942571424,
|
||||
"staged_unique_files_topk": 2,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 2,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "LspBridge",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "LspBridge",
|
||||
"latency_ms": 166.1768000125885,
|
||||
"num_results": 5,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "LspBridge",
|
||||
"latency_ms": 1926.1491999924183,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.07142857142857142,
|
||||
"rbo_topk": 0.045191399425714276,
|
||||
"staged_unique_files_topk": 5,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 2,
|
||||
"dense_unique_dirs_topk": 4
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -1,171 +0,0 @@
|
||||
{
|
||||
"summary": {
|
||||
"timestamp": "2026-02-09 19:19:13",
|
||||
"source": "src",
|
||||
"k": 10,
|
||||
"coarse_k": 100,
|
||||
"query_count": 3,
|
||||
"avg_jaccard_topk": 0.07165641376167692,
|
||||
"avg_rbo_topk": 0.10859973275904759,
|
||||
"staged": {
|
||||
"success": 3,
|
||||
"avg_latency_ms": 8272.264699995518
|
||||
},
|
||||
"dense_rerank": {
|
||||
"success": 3,
|
||||
"avg_latency_ms": 2753.5123999913535
|
||||
}
|
||||
},
|
||||
"comparisons": [
|
||||
{
|
||||
"query": "class Config",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "class Config",
|
||||
"latency_ms": 6453.665100008249,
|
||||
"num_results": 37,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\litellm_reranker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "class Config",
|
||||
"latency_ms": 4530.146999955177,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.05263157894736842,
|
||||
"rbo_topk": 0.045191399425714276,
|
||||
"staged_unique_files_topk": 10,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 5,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "def search",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "def search",
|
||||
"latency_ms": 18202.905599981546,
|
||||
"num_results": 3,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\entities.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 15.580177307128906,
|
||||
"stage2_expand_ms": 16622.225522994995,
|
||||
"stage3_cluster_ms": 0.00095367431640625,
|
||||
"stage4_rerank_ms": 1516.9692039489746
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 3,
|
||||
"stage2_expanded": 4,
|
||||
"stage2_unique_paths": 3,
|
||||
"stage2_duplicate_paths": 1,
|
||||
"stage3_clustered": 4,
|
||||
"stage3_strategy": "score",
|
||||
"stage4_reranked": 4
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "def search",
|
||||
"latency_ms": 1746.9925000071526,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.09090909090909091,
|
||||
"rbo_topk": 0.23541639942571424,
|
||||
"staged_unique_files_topk": 2,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 2,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "LspBridge",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "LspBridge",
|
||||
"latency_ms": 160.2233999967575,
|
||||
"num_results": 5,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "LspBridge",
|
||||
"latency_ms": 1983.3977000117302,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.07142857142857142,
|
||||
"rbo_topk": 0.045191399425714276,
|
||||
"staged_unique_files_topk": 5,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 2,
|
||||
"dense_unique_dirs_topk": 4
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -1,453 +0,0 @@
|
||||
{
|
||||
"summary": {
|
||||
"timestamp": "2026-02-09 11:26:54",
|
||||
"source": "src",
|
||||
"k": 10,
|
||||
"coarse_k": 100,
|
||||
"query_count": 7,
|
||||
"avg_jaccard_topk": 0.39589733329229126,
|
||||
"avg_rbo_topk": 0.23139636799510202,
|
||||
"staged": {
|
||||
"success": 7,
|
||||
"avg_latency_ms": 32194.107242865222
|
||||
},
|
||||
"dense_rerank": {
|
||||
"success": 7,
|
||||
"avg_latency_ms": 2643.366857132741
|
||||
}
|
||||
},
|
||||
"comparisons": [
|
||||
{
|
||||
"query": "class Config",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "class Config",
|
||||
"latency_ms": 43041.41250002384,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\file_watcher.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 9864.638805389404,
|
||||
"stage2_expand_ms": 13012.29190826416,
|
||||
"stage3_cluster_ms": 13297.565460205078,
|
||||
"stage4_rerank_ms": 6821.892261505127
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 100,
|
||||
"stage2_expanded": 149,
|
||||
"stage3_clustered": 20,
|
||||
"stage4_reranked": 20
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "class Config",
|
||||
"latency_ms": 3209.129799991846,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.1111111111111111,
|
||||
"rbo_topk": 0.05429729885142857,
|
||||
"staged_unique_files_topk": 10,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 8,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "def search",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "def search",
|
||||
"latency_ms": 37827.209600031376,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 531.8794250488281,
|
||||
"stage2_expand_ms": 27009.481191635132,
|
||||
"stage3_cluster_ms": 7948.509931564331,
|
||||
"stage4_rerank_ms": 2268.9380645751953
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 100,
|
||||
"stage2_expanded": 101,
|
||||
"stage3_clustered": 20,
|
||||
"stage4_reranked": 20
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "def search",
|
||||
"latency_ms": 2540.472400009632,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.26666666666666666,
|
||||
"rbo_topk": 0.2983708721671428,
|
||||
"staged_unique_files_topk": 9,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 4,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "LspBridge",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "LspBridge",
|
||||
"latency_ms": 24744.686599999666,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\merkle_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 517.8542137145996,
|
||||
"stage2_expand_ms": 12839.622735977173,
|
||||
"stage3_cluster_ms": 9154.959678649902,
|
||||
"stage4_rerank_ms": 2160.0701808929443
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 100,
|
||||
"stage2_expanded": 100,
|
||||
"stage3_clustered": 20,
|
||||
"stage4_reranked": 20
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "LspBridge",
|
||||
"latency_ms": 2482.5908999741077,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.5384615384615384,
|
||||
"rbo_topk": 0.36639083062285716,
|
||||
"staged_unique_files_topk": 10,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 4,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "graph expansion",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "graph expansion",
|
||||
"latency_ms": 25239.59050002694,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 631.9081783294678,
|
||||
"stage2_expand_ms": 12570.756196975708,
|
||||
"stage3_cluster_ms": 9557.724952697754,
|
||||
"stage4_rerank_ms": 2409.7683429718018
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 100,
|
||||
"stage2_expanded": 100,
|
||||
"stage3_clustered": 20,
|
||||
"stage4_reranked": 20
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "graph expansion",
|
||||
"latency_ms": 2574.1938000023365,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.42857142857142855,
|
||||
"rbo_topk": 0.13728894791142857,
|
||||
"staged_unique_files_topk": 10,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 4,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "clustering strategy",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "clustering strategy",
|
||||
"latency_ms": 28572.93939998746,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 659.6193313598633,
|
||||
"stage2_expand_ms": 14207.426309585571,
|
||||
"stage3_cluster_ms": 11513.370037078857,
|
||||
"stage4_rerank_ms": 2117.546319961548
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 100,
|
||||
"stage2_expanded": 100,
|
||||
"stage3_clustered": 20,
|
||||
"stage4_reranked": 20
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "clustering strategy",
|
||||
"latency_ms": 2536.551799982786,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.17647058823529413,
|
||||
"rbo_topk": 0.07116480920571429,
|
||||
"staged_unique_files_topk": 10,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 4,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "error handling",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "error handling",
|
||||
"latency_ms": 23812.726000010967,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 475.42428970336914,
|
||||
"stage2_expand_ms": 12454.935789108276,
|
||||
"stage3_cluster_ms": 8576.019525527954,
|
||||
"stage4_rerank_ms": 2265.360116958618
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 100,
|
||||
"stage2_expanded": 100,
|
||||
"stage3_clustered": 20,
|
||||
"stage4_reranked": 20
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "error handling",
|
||||
"latency_ms": 2648.7773999869823,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.6666666666666666,
|
||||
"rbo_topk": 0.21230026104857144,
|
||||
"staged_unique_files_topk": 10,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 4,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "how to parse json",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "how to parse json",
|
||||
"latency_ms": 42120.1860999763,
|
||||
"num_results": 9,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 570.8920955657959,
|
||||
"stage2_expand_ms": 30054.06880378723,
|
||||
"stage3_cluster_ms": 9285.51697731018,
|
||||
"stage4_rerank_ms": 2142.771005630493
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 100,
|
||||
"stage2_expanded": 100,
|
||||
"stage3_clustered": 20,
|
||||
"stage4_reranked": 20
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "how to parse json",
|
||||
"latency_ms": 2511.8518999814987,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.5833333333333334,
|
||||
"rbo_topk": 0.4799615561585714,
|
||||
"staged_unique_files_topk": 9,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 4,
|
||||
"dense_unique_dirs_topk": 4
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -1,208 +0,0 @@
|
||||
{
|
||||
"summary": {
|
||||
"timestamp": "2026-02-09 17:27:26",
|
||||
"source": "src",
|
||||
"k": 10,
|
||||
"coarse_k": 100,
|
||||
"query_count": 3,
|
||||
"avg_jaccard_topk": 0.5809523809523809,
|
||||
"avg_rbo_topk": 0.31359567182809517,
|
||||
"staged": {
|
||||
"success": 3,
|
||||
"avg_latency_ms": 22826.711433331173
|
||||
},
|
||||
"dense_rerank": {
|
||||
"success": 3,
|
||||
"avg_latency_ms": 2239.804533312718
|
||||
}
|
||||
},
|
||||
"comparisons": [
|
||||
{
|
||||
"query": "class Config",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "class Config",
|
||||
"latency_ms": 26690.878500014544,
|
||||
"num_results": 6,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 8534.121036529541,
|
||||
"stage2_expand_ms": 13298.827648162842,
|
||||
"stage3_cluster_ms": 0.026226043701171875,
|
||||
"stage4_rerank_ms": 4805.774688720703
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 100,
|
||||
"stage2_expanded": 149,
|
||||
"stage2_unique_paths": 43,
|
||||
"stage2_duplicate_paths": 106,
|
||||
"stage3_clustered": 20,
|
||||
"stage3_strategy": "score",
|
||||
"stage4_reranked": 20
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "class Config",
|
||||
"latency_ms": 2416.653799980879,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.14285714285714285,
|
||||
"rbo_topk": 0.25764429885142853,
|
||||
"staged_unique_files_topk": 6,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 3,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "def search",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "def search",
|
||||
"latency_ms": 26188.838399976492,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 525.7587432861328,
|
||||
"stage2_expand_ms": 23659.400939941406,
|
||||
"stage3_cluster_ms": 0.021696090698242188,
|
||||
"stage4_rerank_ms": 1928.950309753418
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 100,
|
||||
"stage2_expanded": 101,
|
||||
"stage2_unique_paths": 23,
|
||||
"stage2_duplicate_paths": 78,
|
||||
"stage3_clustered": 20,
|
||||
"stage3_strategy": "score",
|
||||
"stage4_reranked": 20
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "def search",
|
||||
"latency_ms": 1953.0992999970913,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.9,
|
||||
"rbo_topk": 0.39374892065285705,
|
||||
"staged_unique_files_topk": 9,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 4,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "LspBridge",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "LspBridge",
|
||||
"latency_ms": 15600.41740000248,
|
||||
"num_results": 7,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 475.54636001586914,
|
||||
"stage2_expand_ms": 13318.811893463135,
|
||||
"stage3_cluster_ms": 0.03218650817871094,
|
||||
"stage4_rerank_ms": 1755.7547092437744
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 100,
|
||||
"stage2_expanded": 100,
|
||||
"stage2_unique_paths": 21,
|
||||
"stage2_duplicate_paths": 79,
|
||||
"stage3_clustered": 20,
|
||||
"stage3_strategy": "score",
|
||||
"stage4_reranked": 20
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "LspBridge",
|
||||
"latency_ms": 2349.660499960184,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.7,
|
||||
"rbo_topk": 0.28939379598,
|
||||
"staged_unique_files_topk": 7,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 3,
|
||||
"dense_unique_dirs_topk": 4
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -1,356 +0,0 @@
|
||||
{
|
||||
"summary": {
|
||||
"timestamp": "2026-02-09 20:36:02",
|
||||
"source": "src",
|
||||
"k": 10,
|
||||
"coarse_k": 100,
|
||||
"query_count": 7,
|
||||
"avg_jaccard_topk": 0.12095811211246858,
|
||||
"avg_rbo_topk": 0.09594444061244897,
|
||||
"staged": {
|
||||
"success": 7,
|
||||
"avg_latency_ms": 2436.7641000066483
|
||||
},
|
||||
"dense_rerank": {
|
||||
"success": 7,
|
||||
"avg_latency_ms": 2593.7630428629263
|
||||
}
|
||||
},
|
||||
"comparisons": [
|
||||
{
|
||||
"query": "class Config",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "class Config",
|
||||
"latency_ms": 285.091000020504,
|
||||
"num_results": 37,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\litellm_reranker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "class Config",
|
||||
"latency_ms": 2412.1290000081062,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.05263157894736842,
|
||||
"rbo_topk": 0.045191399425714276,
|
||||
"staged_unique_files_topk": 10,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 5,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "def search",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "def search",
|
||||
"latency_ms": 15029.73520001769,
|
||||
"num_results": 3,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\entities.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 101.95636749267578,
|
||||
"stage2_expand_ms": 12690.008640289307,
|
||||
"stage3_cluster_ms": 0.001430511474609375,
|
||||
"stage4_rerank_ms": 2155.757427215576
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 3,
|
||||
"stage2_expanded": 4,
|
||||
"stage2_unique_paths": 3,
|
||||
"stage2_duplicate_paths": 1,
|
||||
"stage3_clustered": 4,
|
||||
"stage3_strategy": "score",
|
||||
"stage4_reranked": 4
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "def search",
|
||||
"latency_ms": 2424.7003000080585,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.09090909090909091,
|
||||
"rbo_topk": 0.23541639942571424,
|
||||
"staged_unique_files_topk": 2,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 2,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "LspBridge",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "LspBridge",
|
||||
"latency_ms": 324.4240999817848,
|
||||
"num_results": 5,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "LspBridge",
|
||||
"latency_ms": 2497.174100011587,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.07142857142857142,
|
||||
"rbo_topk": 0.045191399425714276,
|
||||
"staged_unique_files_topk": 5,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 2,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "graph expansion",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "graph expansion",
|
||||
"latency_ms": 359.32159999012947,
|
||||
"num_results": 11,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_007_add_graph_neighbors.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\hybrid_search\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "graph expansion",
|
||||
"latency_ms": 2553.8585999906063,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.17647058823529413,
|
||||
"rbo_topk": 0.06801300374142856,
|
||||
"staged_unique_files_topk": 10,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 7,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "clustering strategy",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "clustering strategy",
|
||||
"latency_ms": 286.38240000605583,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\factory.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\noop_strategy.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\hdbscan_strategy.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\dbscan_strategy.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\base.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\frequency_strategy.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "clustering strategy",
|
||||
"latency_ms": 2570.379099994898,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.1111111111111111,
|
||||
"rbo_topk": 0.04670528456571428,
|
||||
"staged_unique_files_topk": 10,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 3,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "error handling",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "error handling",
|
||||
"latency_ms": 412.58780002593994,
|
||||
"num_results": 5,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\rotational_embedder.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "error handling",
|
||||
"latency_ms": 2894.3279000222683,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.07142857142857142,
|
||||
"rbo_topk": 0.045191399425714276,
|
||||
"staged_unique_files_topk": 5,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 4,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "how to parse json",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "how to parse json",
|
||||
"latency_ms": 359.8066000044346,
|
||||
"num_results": 4,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "how to parse json",
|
||||
"latency_ms": 2803.772300004959,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.2727272727272727,
|
||||
"rbo_topk": 0.18590219827714285,
|
||||
"staged_unique_files_topk": 4,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 3,
|
||||
"dense_unique_dirs_topk": 4
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -1,462 +0,0 @@
|
||||
{
|
||||
"summary": {
|
||||
"timestamp": "2026-02-09 20:45:10",
|
||||
"source": "src",
|
||||
"k": 10,
|
||||
"coarse_k": 100,
|
||||
"query_count": 7,
|
||||
"avg_jaccard_topk": 0.1283498247783962,
|
||||
"avg_rbo_topk": 0.09664773770897958,
|
||||
"staged": {
|
||||
"success": 7,
|
||||
"avg_latency_ms": 16394.152085712976
|
||||
},
|
||||
"dense_rerank": {
|
||||
"success": 7,
|
||||
"avg_latency_ms": 2839.464457145759
|
||||
}
|
||||
},
|
||||
"comparisons": [
|
||||
{
|
||||
"query": "class Config",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "class Config",
|
||||
"latency_ms": 6233.342700004578,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 125.80323219299316,
|
||||
"stage1_fallback_search_ms": 277.1914005279541,
|
||||
"stage2_expand_ms": 3032.3121547698975,
|
||||
"stage3_cluster_ms": 0.02765655517578125,
|
||||
"stage4_rerank_ms": 2699.3532180786133
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 37,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 86,
|
||||
"stage2_unique_paths": 53,
|
||||
"stage2_duplicate_paths": 33,
|
||||
"stage3_clustered": 20,
|
||||
"stage3_strategy": "score",
|
||||
"stage4_reranked": 20
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "class Config",
|
||||
"latency_ms": 3036.3474999964237,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.125,
|
||||
"rbo_topk": 0.06741929885142856,
|
||||
"staged_unique_files_topk": 8,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 5,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "def search",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "def search",
|
||||
"latency_ms": 12703.503900021315,
|
||||
"num_results": 3,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\entities.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 83.4202766418457,
|
||||
"stage2_expand_ms": 9856.60433769226,
|
||||
"stage3_cluster_ms": 0.0011920928955078125,
|
||||
"stage4_rerank_ms": 2664.630174636841
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 3,
|
||||
"stage2_expanded": 4,
|
||||
"stage2_unique_paths": 3,
|
||||
"stage2_duplicate_paths": 1,
|
||||
"stage3_clustered": 4,
|
||||
"stage3_strategy": "score",
|
||||
"stage4_reranked": 4
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "def search",
|
||||
"latency_ms": 2888.501700013876,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.09090909090909091,
|
||||
"rbo_topk": 0.23541639942571424,
|
||||
"staged_unique_files_topk": 2,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 2,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "LspBridge",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "LspBridge",
|
||||
"latency_ms": 33684.76710000634,
|
||||
"num_results": 5,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 78.8118839263916,
|
||||
"stage1_fallback_search_ms": 174.6652126312256,
|
||||
"stage2_expand_ms": 31018.909692764282,
|
||||
"stage3_cluster_ms": 0.0016689300537109375,
|
||||
"stage4_rerank_ms": 2316.9021606445312
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 5,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 5,
|
||||
"stage2_unique_paths": 5,
|
||||
"stage2_duplicate_paths": 0,
|
||||
"stage3_clustered": 5,
|
||||
"stage3_strategy": "score",
|
||||
"stage4_reranked": 5
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "LspBridge",
|
||||
"latency_ms": 2824.729699999094,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.07142857142857142,
|
||||
"rbo_topk": 0.045191399425714276,
|
||||
"staged_unique_files_topk": 5,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 2,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "graph expansion",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "graph expansion",
|
||||
"latency_ms": 16910.090099990368,
|
||||
"num_results": 8,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_007_add_graph_neighbors.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 99.6243953704834,
|
||||
"stage1_fallback_search_ms": 207.89742469787598,
|
||||
"stage2_expand_ms": 13929.257154464722,
|
||||
"stage3_cluster_ms": 0.016927719116210938,
|
||||
"stage4_rerank_ms": 2586.843729019165
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 11,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 29,
|
||||
"stage2_unique_paths": 14,
|
||||
"stage2_duplicate_paths": 15,
|
||||
"stage3_clustered": 20,
|
||||
"stage3_strategy": "score",
|
||||
"stage4_reranked": 20
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "graph expansion",
|
||||
"latency_ms": 2765.958099991083,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.21428571428571427,
|
||||
"rbo_topk": 0.06893318399142857,
|
||||
"staged_unique_files_topk": 7,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 6,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "clustering strategy",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "clustering strategy",
|
||||
"latency_ms": 8380.20839998126,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\noop_strategy.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\dbscan_strategy.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\base.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\hdbscan_strategy.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\frequency_strategy.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\factory.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\__init__.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 95.42632102966309,
|
||||
"stage1_fallback_search_ms": 187.4692440032959,
|
||||
"stage2_expand_ms": 5561.658143997192,
|
||||
"stage3_cluster_ms": 0.0007152557373046875,
|
||||
"stage4_rerank_ms": 2441.287040710449
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 10,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 10,
|
||||
"stage2_unique_paths": 10,
|
||||
"stage2_duplicate_paths": 0,
|
||||
"stage3_clustered": 10,
|
||||
"stage3_strategy": "score",
|
||||
"stage4_reranked": 10
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "clustering strategy",
|
||||
"latency_ms": 2788.0665000081062,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.1111111111111111,
|
||||
"rbo_topk": 0.04670528456571428,
|
||||
"staged_unique_files_topk": 10,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 3,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "error handling",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "error handling",
|
||||
"latency_ms": 19897.71709999442,
|
||||
"num_results": 6,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\rotational_embedder.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 114.1653060913086,
|
||||
"stage1_fallback_search_ms": 235.73827743530273,
|
||||
"stage2_expand_ms": 16702.077865600586,
|
||||
"stage3_cluster_ms": 0.00095367431640625,
|
||||
"stage4_rerank_ms": 2757.4093341827393
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 5,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 13,
|
||||
"stage2_unique_paths": 6,
|
||||
"stage2_duplicate_paths": 7,
|
||||
"stage3_clustered": 13,
|
||||
"stage3_strategy": "score",
|
||||
"stage4_reranked": 13
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "error handling",
|
||||
"latency_ms": 2874.178600013256,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.07142857142857142,
|
||||
"rbo_topk": 0.045191399425714276,
|
||||
"staged_unique_files_topk": 5,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 4,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "how to parse json",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "how to parse json",
|
||||
"latency_ms": 16949.43529999256,
|
||||
"num_results": 7,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\indexing\\symbol_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\treesitter_parser.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 104.50935363769531,
|
||||
"stage1_fallback_search_ms": 190.6723976135254,
|
||||
"stage2_expand_ms": 14165.841102600098,
|
||||
"stage3_cluster_ms": 0.0011920928955078125,
|
||||
"stage4_rerank_ms": 2399.226188659668
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 4,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 11,
|
||||
"stage2_unique_paths": 7,
|
||||
"stage2_duplicate_paths": 4,
|
||||
"stage3_clustered": 11,
|
||||
"stage3_strategy": "score",
|
||||
"stage4_reranked": 11
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "how to parse json",
|
||||
"latency_ms": 2698.469099998474,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.21428571428571427,
|
||||
"rbo_topk": 0.16767719827714284,
|
||||
"staged_unique_files_topk": 7,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 5,
|
||||
"dense_unique_dirs_topk": 4
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -1,465 +0,0 @@
|
||||
{
|
||||
"summary": {
|
||||
"timestamp": "2026-02-09 20:53:01",
|
||||
"source": "src",
|
||||
"k": 10,
|
||||
"coarse_k": 100,
|
||||
"query_count": 7,
|
||||
"avg_jaccard_topk": 0.12384302205730777,
|
||||
"avg_rbo_topk": 0.09816673566816325,
|
||||
"staged": {
|
||||
"success": 7,
|
||||
"avg_latency_ms": 8696.564499999795
|
||||
},
|
||||
"dense_rerank": {
|
||||
"success": 7,
|
||||
"avg_latency_ms": 2936.2583857136115
|
||||
}
|
||||
},
|
||||
"comparisons": [
|
||||
{
|
||||
"query": "class Config",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "class Config",
|
||||
"latency_ms": 6108.304299980402,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 90.47985076904297,
|
||||
"stage1_fallback_search_ms": 224.38788414001465,
|
||||
"stage2_expand_ms": 3031.7258834838867,
|
||||
"stage3_cluster_ms": 0.02956390380859375,
|
||||
"stage4_rerank_ms": 2655.31849861145
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 37,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 86,
|
||||
"stage2_unique_paths": 53,
|
||||
"stage2_duplicate_paths": 33,
|
||||
"stage3_clustered": 20,
|
||||
"stage3_strategy": "score",
|
||||
"stage4_reranked": 20
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "class Config",
|
||||
"latency_ms": 2873.6466999948025,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.125,
|
||||
"rbo_topk": 0.06741929885142856,
|
||||
"staged_unique_files_topk": 8,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 5,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "def search",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "def search",
|
||||
"latency_ms": 9321.754200011492,
|
||||
"num_results": 3,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\entities.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 140.43283462524414,
|
||||
"stage2_expand_ms": 6410.467863082886,
|
||||
"stage3_cluster_ms": 0.00095367431640625,
|
||||
"stage4_rerank_ms": 2675.7972240448
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 3,
|
||||
"stage2_expanded": 4,
|
||||
"stage2_unique_paths": 3,
|
||||
"stage2_duplicate_paths": 1,
|
||||
"stage3_clustered": 4,
|
||||
"stage3_strategy": "score",
|
||||
"stage4_reranked": 4
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "def search",
|
||||
"latency_ms": 3104.7773999869823,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.09090909090909091,
|
||||
"rbo_topk": 0.23541639942571424,
|
||||
"staged_unique_files_topk": 2,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 2,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "LspBridge",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "LspBridge",
|
||||
"latency_ms": 9527.073799997568,
|
||||
"num_results": 6,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\keepalive_bridge.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\__init__.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 98.59919548034668,
|
||||
"stage1_fallback_search_ms": 172.26457595825195,
|
||||
"stage2_expand_ms": 6125.282049179077,
|
||||
"stage3_cluster_ms": 0.017404556274414062,
|
||||
"stage4_rerank_ms": 3023.9248275756836
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 5,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 31,
|
||||
"stage2_unique_paths": 11,
|
||||
"stage2_duplicate_paths": 20,
|
||||
"stage3_clustered": 20,
|
||||
"stage3_strategy": "score",
|
||||
"stage4_reranked": 20
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "LspBridge",
|
||||
"latency_ms": 2901.0302999913692,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.06666666666666667,
|
||||
"rbo_topk": 0.045191399425714276,
|
||||
"staged_unique_files_topk": 6,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 2,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "graph expansion",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "graph expansion",
|
||||
"latency_ms": 9120.886200010777,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_007_add_graph_neighbors.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 91.48454666137695,
|
||||
"stage1_fallback_search_ms": 172.12390899658203,
|
||||
"stage2_expand_ms": 6166.24903678894,
|
||||
"stage3_cluster_ms": 0.00095367431640625,
|
||||
"stage4_rerank_ms": 2601.947546005249
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 11,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 16,
|
||||
"stage2_unique_paths": 13,
|
||||
"stage2_duplicate_paths": 3,
|
||||
"stage3_clustered": 16,
|
||||
"stage3_strategy": "score",
|
||||
"stage4_reranked": 16
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "graph expansion",
|
||||
"latency_ms": 2847.6964999735355,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.1875,
|
||||
"rbo_topk": 0.06134116970571428,
|
||||
"staged_unique_files_topk": 9,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 7,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "clustering strategy",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "clustering strategy",
|
||||
"latency_ms": 8424.535699993372,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\dbscan_strategy.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\noop_strategy.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\base.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\hdbscan_strategy.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\frequency_strategy.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\factory.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\__init__.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 92.8945541381836,
|
||||
"stage1_fallback_search_ms": 192.06547737121582,
|
||||
"stage2_expand_ms": 5568.126440048218,
|
||||
"stage3_cluster_ms": 0.0011920928955078125,
|
||||
"stage4_rerank_ms": 2480.673313140869
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 10,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 10,
|
||||
"stage2_unique_paths": 10,
|
||||
"stage2_duplicate_paths": 0,
|
||||
"stage3_clustered": 10,
|
||||
"stage3_strategy": "score",
|
||||
"stage4_reranked": 10
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "clustering strategy",
|
||||
"latency_ms": 2974.9999000132084,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.1111111111111111,
|
||||
"rbo_topk": 0.04670528456571428,
|
||||
"staged_unique_files_topk": 10,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 3,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "error handling",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "error handling",
|
||||
"latency_ms": 9253.624700009823,
|
||||
"num_results": 6,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\rotational_embedder.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 102.18691825866699,
|
||||
"stage1_fallback_search_ms": 176.97691917419434,
|
||||
"stage2_expand_ms": 6113.626480102539,
|
||||
"stage3_cluster_ms": 0.00095367431640625,
|
||||
"stage4_rerank_ms": 2774.4452953338623
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 5,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 13,
|
||||
"stage2_unique_paths": 6,
|
||||
"stage2_duplicate_paths": 7,
|
||||
"stage3_clustered": 13,
|
||||
"stage3_strategy": "score",
|
||||
"stage4_reranked": 13
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "error handling",
|
||||
"latency_ms": 2860.619900047779,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.07142857142857142,
|
||||
"rbo_topk": 0.045191399425714276,
|
||||
"staged_unique_files_topk": 5,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 4,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "how to parse json",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "how to parse json",
|
||||
"latency_ms": 9119.772599995136,
|
||||
"num_results": 7,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\indexing\\symbol_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\treesitter_parser.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 90.18850326538086,
|
||||
"stage1_fallback_search_ms": 157.95397758483887,
|
||||
"stage2_expand_ms": 6293.469429016113,
|
||||
"stage3_cluster_ms": 0.0011920928955078125,
|
||||
"stage4_rerank_ms": 2486.8383407592773
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 4,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 9,
|
||||
"stage2_unique_paths": 7,
|
||||
"stage2_duplicate_paths": 2,
|
||||
"stage3_clustered": 9,
|
||||
"stage3_strategy": "score",
|
||||
"stage4_reranked": 9
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "how to parse json",
|
||||
"latency_ms": 2991.0379999876022,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.21428571428571427,
|
||||
"rbo_topk": 0.18590219827714285,
|
||||
"staged_unique_files_topk": 7,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 5,
|
||||
"dense_unique_dirs_topk": 4
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -1,465 +0,0 @@
|
||||
{
|
||||
"summary": {
|
||||
"timestamp": "2026-02-10 12:23:36",
|
||||
"source": "src",
|
||||
"k": 10,
|
||||
"coarse_k": 100,
|
||||
"query_count": 7,
|
||||
"avg_jaccard_topk": 0.12384302205730777,
|
||||
"avg_rbo_topk": 0.09816673566816325,
|
||||
"staged": {
|
||||
"success": 7,
|
||||
"avg_latency_ms": 3996.4113285754406
|
||||
},
|
||||
"dense_rerank": {
|
||||
"success": 7,
|
||||
"avg_latency_ms": 2780.485200004918
|
||||
}
|
||||
},
|
||||
"comparisons": [
|
||||
{
|
||||
"query": "class Config",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "class Config",
|
||||
"latency_ms": 2365.3048999905586,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 25.228023529052734,
|
||||
"stage1_fallback_search_ms": 206.0999870300293,
|
||||
"stage2_expand_ms": 16.644954681396484,
|
||||
"stage3_cluster_ms": 0.025987625122070312,
|
||||
"stage4_rerank_ms": 2064.2504692077637
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 37,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 86,
|
||||
"stage2_unique_paths": 53,
|
||||
"stage2_duplicate_paths": 33,
|
||||
"stage3_clustered": 20,
|
||||
"stage3_strategy": "score",
|
||||
"stage4_reranked": 20
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "class Config",
|
||||
"latency_ms": 2610.047899991274,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.125,
|
||||
"rbo_topk": 0.06741929885142856,
|
||||
"staged_unique_files_topk": 8,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 5,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "def search",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "def search",
|
||||
"latency_ms": 3723.305599987507,
|
||||
"num_results": 3,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\entities.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 31.742334365844727,
|
||||
"stage2_expand_ms": 2125.1025199890137,
|
||||
"stage3_cluster_ms": 0.0007152557373046875,
|
||||
"stage4_rerank_ms": 1511.4071369171143
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 3,
|
||||
"stage2_expanded": 4,
|
||||
"stage2_unique_paths": 3,
|
||||
"stage2_duplicate_paths": 1,
|
||||
"stage3_clustered": 4,
|
||||
"stage3_strategy": "score",
|
||||
"stage4_reranked": 4
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "def search",
|
||||
"latency_ms": 2072.4792000055313,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.09090909090909091,
|
||||
"rbo_topk": 0.23541639942571424,
|
||||
"staged_unique_files_topk": 2,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 2,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "LspBridge",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "LspBridge",
|
||||
"latency_ms": 5251.151299983263,
|
||||
"num_results": 6,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\keepalive_bridge.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\__init__.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 32.721757888793945,
|
||||
"stage1_fallback_search_ms": 195.51420211791992,
|
||||
"stage2_expand_ms": 2060.0733757019043,
|
||||
"stage3_cluster_ms": 0.0095367431640625,
|
||||
"stage4_rerank_ms": 2900.8395671844482
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 5,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 31,
|
||||
"stage2_unique_paths": 11,
|
||||
"stage2_duplicate_paths": 20,
|
||||
"stage3_clustered": 20,
|
||||
"stage3_strategy": "score",
|
||||
"stage4_reranked": 20
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "LspBridge",
|
||||
"latency_ms": 1972.8982000350952,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.06666666666666667,
|
||||
"rbo_topk": 0.045191399425714276,
|
||||
"staged_unique_files_topk": 6,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 2,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "graph expansion",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "graph expansion",
|
||||
"latency_ms": 4101.171400010586,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_007_add_graph_neighbors.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 29.141902923583984,
|
||||
"stage1_fallback_search_ms": 234.2982292175293,
|
||||
"stage2_expand_ms": 2082.4878215789795,
|
||||
"stage3_cluster_ms": 0.0011920928955078125,
|
||||
"stage4_rerank_ms": 1698.7183094024658
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 11,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 16,
|
||||
"stage2_unique_paths": 13,
|
||||
"stage2_duplicate_paths": 3,
|
||||
"stage3_clustered": 16,
|
||||
"stage3_strategy": "score",
|
||||
"stage4_reranked": 16
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "graph expansion",
|
||||
"latency_ms": 2331.9747000038624,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.1875,
|
||||
"rbo_topk": 0.06134116970571428,
|
||||
"staged_unique_files_topk": 9,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 7,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "clustering strategy",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "clustering strategy",
|
||||
"latency_ms": 4032.0041000247,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\noop_strategy.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\dbscan_strategy.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\base.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\hdbscan_strategy.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\frequency_strategy.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\factory.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\__init__.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 42.098283767700195,
|
||||
"stage1_fallback_search_ms": 209.6574306488037,
|
||||
"stage2_expand_ms": 2053.9097785949707,
|
||||
"stage3_cluster_ms": 0.00095367431640625,
|
||||
"stage4_rerank_ms": 1665.3883457183838
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 10,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 10,
|
||||
"stage2_unique_paths": 10,
|
||||
"stage2_duplicate_paths": 0,
|
||||
"stage3_clustered": 10,
|
||||
"stage3_strategy": "score",
|
||||
"stage4_reranked": 10
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "clustering strategy",
|
||||
"latency_ms": 2026.5661999881268,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.1111111111111111,
|
||||
"rbo_topk": 0.04670528456571428,
|
||||
"staged_unique_files_topk": 10,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 3,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "error handling",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "error handling",
|
||||
"latency_ms": 4237.893900036812,
|
||||
"num_results": 6,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\rotational_embedder.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 64.01538848876953,
|
||||
"stage1_fallback_search_ms": 225.14033317565918,
|
||||
"stage2_expand_ms": 2116.3012981414795,
|
||||
"stage3_cluster_ms": 0.0007152557373046875,
|
||||
"stage4_rerank_ms": 1776.0803699493408
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 5,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 13,
|
||||
"stage2_unique_paths": 6,
|
||||
"stage2_duplicate_paths": 7,
|
||||
"stage3_clustered": 13,
|
||||
"stage3_strategy": "score",
|
||||
"stage4_reranked": 13
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "error handling",
|
||||
"latency_ms": 2125.935900002718,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.07142857142857142,
|
||||
"rbo_topk": 0.045191399425714276,
|
||||
"staged_unique_files_topk": 5,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 4,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "how to parse json",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "how to parse json",
|
||||
"latency_ms": 4264.048099994659,
|
||||
"num_results": 7,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\indexing\\symbol_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\treesitter_parser.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 31.972646713256836,
|
||||
"stage1_fallback_search_ms": 235.47840118408203,
|
||||
"stage2_expand_ms": 2161.5889072418213,
|
||||
"stage3_cluster_ms": 0.0007152557373046875,
|
||||
"stage4_rerank_ms": 1768.0847644805908
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 4,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 9,
|
||||
"stage2_unique_paths": 7,
|
||||
"stage2_duplicate_paths": 2,
|
||||
"stage3_clustered": 9,
|
||||
"stage3_strategy": "score",
|
||||
"stage4_reranked": 9
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "how to parse json",
|
||||
"latency_ms": 6323.49430000782,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.21428571428571427,
|
||||
"rbo_topk": 0.18590219827714285,
|
||||
"staged_unique_files_topk": 7,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 5,
|
||||
"dense_unique_dirs_topk": 4
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -1,467 +0,0 @@
|
||||
{
|
||||
"summary": {
|
||||
"timestamp": "2026-02-10 12:46:47",
|
||||
"source": "src",
|
||||
"k": 10,
|
||||
"coarse_k": 100,
|
||||
"query_count": 7,
|
||||
"avg_jaccard_topk": 0.11350467619264612,
|
||||
"avg_rbo_topk": 0.09062624799510204,
|
||||
"staged": {
|
||||
"success": 7,
|
||||
"avg_latency_ms": 5670.9065000244545
|
||||
},
|
||||
"dense_rerank": {
|
||||
"success": 7,
|
||||
"avg_latency_ms": 3047.475757143327
|
||||
}
|
||||
},
|
||||
"comparisons": [
|
||||
{
|
||||
"query": "class Config",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "class Config",
|
||||
"latency_ms": 2971.5892000496387,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\file_watcher.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 108.11758041381836,
|
||||
"stage1_fallback_search_ms": 230.96132278442383,
|
||||
"stage2_expand_ms": 18.60976219177246,
|
||||
"stage3_cluster_ms": 1.100301742553711,
|
||||
"stage4_rerank_ms": 2528.761625289917
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 37,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 86,
|
||||
"stage2_unique_paths": 53,
|
||||
"stage2_duplicate_paths": 33,
|
||||
"stage3_clustered": 20,
|
||||
"stage3_strategy": "dir_rr",
|
||||
"stage4_reranked": 20
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "class Config",
|
||||
"latency_ms": 2937.113800019026,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.05263157894736842,
|
||||
"rbo_topk": 0.014635885139999999,
|
||||
"staged_unique_files_topk": 10,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 8,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "def search",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "def search",
|
||||
"latency_ms": 10065.153400033712,
|
||||
"num_results": 3,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\entities.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 127.17461585998535,
|
||||
"stage2_expand_ms": 7361.833810806274,
|
||||
"stage3_cluster_ms": 0.001430511474609375,
|
||||
"stage4_rerank_ms": 2472.7542400360107
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 3,
|
||||
"stage2_expanded": 4,
|
||||
"stage2_unique_paths": 3,
|
||||
"stage2_duplicate_paths": 1,
|
||||
"stage3_clustered": 4,
|
||||
"stage3_strategy": "dir_rr",
|
||||
"stage4_reranked": 4
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "def search",
|
||||
"latency_ms": 3059.5018000006676,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.09090909090909091,
|
||||
"rbo_topk": 0.23541639942571424,
|
||||
"staged_unique_files_topk": 2,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 2,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "LspBridge",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "LspBridge",
|
||||
"latency_ms": 5557.314100056887,
|
||||
"num_results": 7,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\keepalive_bridge.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\__init__.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 133.9263916015625,
|
||||
"stage1_fallback_search_ms": 242.1243190765381,
|
||||
"stage2_expand_ms": 2106.602430343628,
|
||||
"stage3_cluster_ms": 0.47016143798828125,
|
||||
"stage4_rerank_ms": 2967.3829078674316
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 5,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 31,
|
||||
"stage2_unique_paths": 11,
|
||||
"stage2_duplicate_paths": 20,
|
||||
"stage3_clustered": 20,
|
||||
"stage3_strategy": "dir_rr",
|
||||
"stage4_reranked": 20
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "LspBridge",
|
||||
"latency_ms": 3157.7918999791145,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.06666666666666667,
|
||||
"rbo_topk": 0.045191399425714276,
|
||||
"staged_unique_files_topk": 6,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 2,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "graph expansion",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "graph expansion",
|
||||
"latency_ms": 5458.670999974012,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_007_add_graph_neighbors.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 113.62957954406738,
|
||||
"stage1_fallback_search_ms": 204.56886291503906,
|
||||
"stage2_expand_ms": 2166.4509773254395,
|
||||
"stage3_cluster_ms": 0.0011920928955078125,
|
||||
"stage4_rerank_ms": 2872.969627380371
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 11,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 16,
|
||||
"stage2_unique_paths": 13,
|
||||
"stage2_duplicate_paths": 3,
|
||||
"stage3_clustered": 16,
|
||||
"stage3_strategy": "dir_rr",
|
||||
"stage4_reranked": 16
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "graph expansion",
|
||||
"latency_ms": 2896.5341999828815,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.1875,
|
||||
"rbo_topk": 0.06134116970571428,
|
||||
"staged_unique_files_topk": 9,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 7,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "clustering strategy",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "clustering strategy",
|
||||
"latency_ms": 5028.861099988222,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\noop_strategy.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\dbscan_strategy.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\base.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\hdbscan_strategy.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\frequency_strategy.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\factory.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\__init__.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 111.71293258666992,
|
||||
"stage1_fallback_search_ms": 192.02208518981934,
|
||||
"stage2_expand_ms": 2054.065465927124,
|
||||
"stage3_cluster_ms": 0.0007152557373046875,
|
||||
"stage4_rerank_ms": 2579.0507793426514
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 10,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 10,
|
||||
"stage2_unique_paths": 10,
|
||||
"stage2_duplicate_paths": 0,
|
||||
"stage3_clustered": 10,
|
||||
"stage3_strategy": "dir_rr",
|
||||
"stage4_reranked": 10
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "clustering strategy",
|
||||
"latency_ms": 3627.1755999922752,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.1111111111111111,
|
||||
"rbo_topk": 0.04670528456571428,
|
||||
"staged_unique_files_topk": 10,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 3,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "error handling",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "error handling",
|
||||
"latency_ms": 5114.356300055981,
|
||||
"num_results": 6,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\rotational_embedder.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 135.76626777648926,
|
||||
"stage1_fallback_search_ms": 211.12942695617676,
|
||||
"stage2_expand_ms": 2151.059150695801,
|
||||
"stage3_cluster_ms": 0.00095367431640625,
|
||||
"stage4_rerank_ms": 2519.892692565918
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 5,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 13,
|
||||
"stage2_unique_paths": 6,
|
||||
"stage2_duplicate_paths": 7,
|
||||
"stage3_clustered": 13,
|
||||
"stage3_strategy": "dir_rr",
|
||||
"stage4_reranked": 13
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "error handling",
|
||||
"latency_ms": 2853.594000041485,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.07142857142857142,
|
||||
"rbo_topk": 0.045191399425714276,
|
||||
"staged_unique_files_topk": 5,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 4,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "how to parse json",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "how to parse json",
|
||||
"latency_ms": 5500.400400012732,
|
||||
"num_results": 7,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\indexing\\symbol_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\treesitter_parser.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 96.66872024536133,
|
||||
"stage1_fallback_search_ms": 176.37205123901367,
|
||||
"stage2_expand_ms": 2137.751340866089,
|
||||
"stage3_cluster_ms": 0.00095367431640625,
|
||||
"stage4_rerank_ms": 2991.840124130249
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 4,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 9,
|
||||
"stage2_unique_paths": 7,
|
||||
"stage2_duplicate_paths": 2,
|
||||
"stage3_clustered": 9,
|
||||
"stage3_strategy": "dir_rr",
|
||||
"stage4_reranked": 9
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "how to parse json",
|
||||
"latency_ms": 2800.6189999878407,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.21428571428571427,
|
||||
"rbo_topk": 0.18590219827714285,
|
||||
"staged_unique_files_topk": 7,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 5,
|
||||
"dense_unique_dirs_topk": 4
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -1,465 +0,0 @@
|
||||
{
|
||||
"summary": {
|
||||
"timestamp": "2026-02-10 12:52:44",
|
||||
"source": "src",
|
||||
"k": 10,
|
||||
"coarse_k": 100,
|
||||
"query_count": 7,
|
||||
"avg_jaccard_topk": 0.13455730777159347,
|
||||
"avg_rbo_topk": 0.10274807844326529,
|
||||
"staged": {
|
||||
"success": 7,
|
||||
"avg_latency_ms": 4445.262371412346
|
||||
},
|
||||
"dense_rerank": {
|
||||
"success": 7,
|
||||
"avg_latency_ms": 3327.1750857276575
|
||||
}
|
||||
},
|
||||
"comparisons": [
|
||||
{
|
||||
"query": "class Config",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "class Config",
|
||||
"latency_ms": 2719.7998999655247,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 33.12373161315918,
|
||||
"stage1_fallback_search_ms": 230.31878471374512,
|
||||
"stage2_expand_ms": 22.444486618041992,
|
||||
"stage3_cluster_ms": 0.06079673767089844,
|
||||
"stage4_rerank_ms": 2338.5443687438965
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 37,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 86,
|
||||
"stage2_unique_paths": 53,
|
||||
"stage2_duplicate_paths": 33,
|
||||
"stage3_clustered": 20,
|
||||
"stage3_strategy": "path",
|
||||
"stage4_reranked": 20
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "class Config",
|
||||
"latency_ms": 2334.8668000102043,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.2,
|
||||
"rbo_topk": 0.09948869827714285,
|
||||
"staged_unique_files_topk": 8,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 4,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "def search",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "def search",
|
||||
"latency_ms": 4470.056899994612,
|
||||
"num_results": 3,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\entities.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 28.5646915435791,
|
||||
"stage2_expand_ms": 2216.57133102417,
|
||||
"stage3_cluster_ms": 0.001430511474609375,
|
||||
"stage4_rerank_ms": 2131.246566772461
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 3,
|
||||
"stage2_expanded": 4,
|
||||
"stage2_unique_paths": 3,
|
||||
"stage2_duplicate_paths": 1,
|
||||
"stage3_clustered": 4,
|
||||
"stage3_strategy": "path",
|
||||
"stage4_reranked": 4
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "def search",
|
||||
"latency_ms": 2447.341199964285,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.09090909090909091,
|
||||
"rbo_topk": 0.23541639942571424,
|
||||
"staged_unique_files_topk": 2,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 2,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "LspBridge",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "LspBridge",
|
||||
"latency_ms": 6126.65680000186,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\keepalive_bridge.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 25.135278701782227,
|
||||
"stage1_fallback_search_ms": 171.53453826904297,
|
||||
"stage2_expand_ms": 2094.9013233184814,
|
||||
"stage3_cluster_ms": 0.024318695068359375,
|
||||
"stage4_rerank_ms": 3743.204355239868
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 5,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 31,
|
||||
"stage2_unique_paths": 11,
|
||||
"stage2_duplicate_paths": 20,
|
||||
"stage3_clustered": 11,
|
||||
"stage3_strategy": "path",
|
||||
"stage4_reranked": 11
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "LspBridge",
|
||||
"latency_ms": 9015.508300036192,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.06666666666666667,
|
||||
"rbo_topk": 0.045191399425714276,
|
||||
"staged_unique_files_topk": 6,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 2,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "graph expansion",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "graph expansion",
|
||||
"latency_ms": 4319.597599953413,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_007_add_graph_neighbors.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 18.799781799316406,
|
||||
"stage1_fallback_search_ms": 167.36602783203125,
|
||||
"stage2_expand_ms": 2101.4957427978516,
|
||||
"stage3_cluster_ms": 0.00095367431640625,
|
||||
"stage4_rerank_ms": 1976.8805503845215
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 11,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 16,
|
||||
"stage2_unique_paths": 13,
|
||||
"stage2_duplicate_paths": 3,
|
||||
"stage3_clustered": 16,
|
||||
"stage3_strategy": "path",
|
||||
"stage4_reranked": 16
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "graph expansion",
|
||||
"latency_ms": 2356.994699984789,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.1875,
|
||||
"rbo_topk": 0.06134116970571428,
|
||||
"staged_unique_files_topk": 9,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 7,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "clustering strategy",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "clustering strategy",
|
||||
"latency_ms": 4574.691199988127,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\dbscan_strategy.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\noop_strategy.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\base.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\hdbscan_strategy.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\frequency_strategy.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\factory.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\__init__.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 45.72629928588867,
|
||||
"stage1_fallback_search_ms": 233.0036163330078,
|
||||
"stage2_expand_ms": 2068.8536167144775,
|
||||
"stage3_cluster_ms": 0.0007152557373046875,
|
||||
"stage4_rerank_ms": 2152.9064178466797
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 10,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 10,
|
||||
"stage2_unique_paths": 10,
|
||||
"stage2_duplicate_paths": 0,
|
||||
"stage3_clustered": 10,
|
||||
"stage3_strategy": "path",
|
||||
"stage4_reranked": 10
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "clustering strategy",
|
||||
"latency_ms": 2311.4787000119686,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.1111111111111111,
|
||||
"rbo_topk": 0.04670528456571428,
|
||||
"staged_unique_files_topk": 10,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 3,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "error handling",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "error handling",
|
||||
"latency_ms": 4616.5374999940395,
|
||||
"num_results": 6,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\rotational_embedder.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 38.83004188537598,
|
||||
"stage1_fallback_search_ms": 263.0441188812256,
|
||||
"stage2_expand_ms": 2070.7976818084717,
|
||||
"stage3_cluster_ms": 0.00095367431640625,
|
||||
"stage4_rerank_ms": 2133.629083633423
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 5,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 13,
|
||||
"stage2_unique_paths": 6,
|
||||
"stage2_duplicate_paths": 7,
|
||||
"stage3_clustered": 13,
|
||||
"stage3_strategy": "path",
|
||||
"stage4_reranked": 13
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "error handling",
|
||||
"latency_ms": 2337.4413000643253,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.07142857142857142,
|
||||
"rbo_topk": 0.045191399425714276,
|
||||
"staged_unique_files_topk": 5,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 4,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "how to parse json",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "how to parse json",
|
||||
"latency_ms": 4289.496699988842,
|
||||
"num_results": 7,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\indexing\\symbol_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\treesitter_parser.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 34.40546989440918,
|
||||
"stage1_fallback_search_ms": 231.8587303161621,
|
||||
"stage2_expand_ms": 2068.8445568084717,
|
||||
"stage3_cluster_ms": 0.0007152557373046875,
|
||||
"stage4_rerank_ms": 1850.6083488464355
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 4,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 9,
|
||||
"stage2_unique_paths": 7,
|
||||
"stage2_duplicate_paths": 2,
|
||||
"stage3_clustered": 9,
|
||||
"stage3_strategy": "path",
|
||||
"stage4_reranked": 9
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "how to parse json",
|
||||
"latency_ms": 2486.594600021839,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.21428571428571427,
|
||||
"rbo_topk": 0.18590219827714285,
|
||||
"staged_unique_files_topk": 7,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 5,
|
||||
"dense_unique_dirs_topk": 4
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -1,465 +0,0 @@
|
||||
{
|
||||
"summary": {
|
||||
"timestamp": "2026-02-10 12:44:24",
|
||||
"source": "src",
|
||||
"k": 10,
|
||||
"coarse_k": 100,
|
||||
"query_count": 7,
|
||||
"avg_jaccard_topk": 0.12384302205730777,
|
||||
"avg_rbo_topk": 0.09816673566816325,
|
||||
"staged": {
|
||||
"success": 7,
|
||||
"avg_latency_ms": 4603.035771421024
|
||||
},
|
||||
"dense_rerank": {
|
||||
"success": 7,
|
||||
"avg_latency_ms": 2776.139728575945
|
||||
}
|
||||
},
|
||||
"comparisons": [
|
||||
{
|
||||
"query": "class Config",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "class Config",
|
||||
"latency_ms": 3544.4309000074863,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 34.082651138305664,
|
||||
"stage1_fallback_search_ms": 217.52095222473145,
|
||||
"stage2_expand_ms": 18.847942352294922,
|
||||
"stage3_cluster_ms": 0.031948089599609375,
|
||||
"stage4_rerank_ms": 3176.4564514160156
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 37,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 86,
|
||||
"stage2_unique_paths": 53,
|
||||
"stage2_duplicate_paths": 33,
|
||||
"stage3_clustered": 20,
|
||||
"stage3_strategy": "score",
|
||||
"stage4_reranked": 20
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "class Config",
|
||||
"latency_ms": 3075.5329999923706,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.125,
|
||||
"rbo_topk": 0.06741929885142856,
|
||||
"staged_unique_files_topk": 8,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 5,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "def search",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "def search",
|
||||
"latency_ms": 4371.493600010872,
|
||||
"num_results": 3,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\entities.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 29.517173767089844,
|
||||
"stage2_expand_ms": 2236.224412918091,
|
||||
"stage3_cluster_ms": 0.0007152557373046875,
|
||||
"stage4_rerank_ms": 1998.866319656372
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 3,
|
||||
"stage2_expanded": 4,
|
||||
"stage2_unique_paths": 3,
|
||||
"stage2_duplicate_paths": 1,
|
||||
"stage3_clustered": 4,
|
||||
"stage3_strategy": "score",
|
||||
"stage4_reranked": 4
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "def search",
|
||||
"latency_ms": 2334.758200019598,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.09090909090909091,
|
||||
"rbo_topk": 0.23541639942571424,
|
||||
"staged_unique_files_topk": 2,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 2,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "LspBridge",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "LspBridge",
|
||||
"latency_ms": 4143.470999985933,
|
||||
"num_results": 6,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\keepalive_bridge.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\__init__.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 20.66636085510254,
|
||||
"stage1_fallback_search_ms": 150.6054401397705,
|
||||
"stage2_expand_ms": 2064.2361640930176,
|
||||
"stage3_cluster_ms": 0.012159347534179688,
|
||||
"stage4_rerank_ms": 1838.1483554840088
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 5,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 31,
|
||||
"stage2_unique_paths": 11,
|
||||
"stage2_duplicate_paths": 20,
|
||||
"stage3_clustered": 20,
|
||||
"stage3_strategy": "score",
|
||||
"stage4_reranked": 20
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "LspBridge",
|
||||
"latency_ms": 2207.86700001359,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.06666666666666667,
|
||||
"rbo_topk": 0.045191399425714276,
|
||||
"staged_unique_files_topk": 6,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 2,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "graph expansion",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "graph expansion",
|
||||
"latency_ms": 4234.638899981976,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_007_add_graph_neighbors.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 21.48127555847168,
|
||||
"stage1_fallback_search_ms": 153.59735488891602,
|
||||
"stage2_expand_ms": 2092.521905899048,
|
||||
"stage3_cluster_ms": 0.00095367431640625,
|
||||
"stage4_rerank_ms": 1876.7595291137695
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 11,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 16,
|
||||
"stage2_unique_paths": 13,
|
||||
"stage2_duplicate_paths": 3,
|
||||
"stage3_clustered": 16,
|
||||
"stage3_strategy": "score",
|
||||
"stage4_reranked": 16
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "graph expansion",
|
||||
"latency_ms": 2646.9266000390053,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.1875,
|
||||
"rbo_topk": 0.06134116970571428,
|
||||
"staged_unique_files_topk": 9,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 7,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "clustering strategy",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "clustering strategy",
|
||||
"latency_ms": 4778.165899991989,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\dbscan_strategy.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\noop_strategy.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\base.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\hdbscan_strategy.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\frequency_strategy.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\factory.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\__init__.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 18.590688705444336,
|
||||
"stage1_fallback_search_ms": 195.90282440185547,
|
||||
"stage2_expand_ms": 2053.685426712036,
|
||||
"stage3_cluster_ms": 0.00095367431640625,
|
||||
"stage4_rerank_ms": 2431.095838546753
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 10,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 10,
|
||||
"stage2_unique_paths": 10,
|
||||
"stage2_duplicate_paths": 0,
|
||||
"stage3_clustered": 10,
|
||||
"stage3_strategy": "score",
|
||||
"stage4_reranked": 10
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "clustering strategy",
|
||||
"latency_ms": 2887.1304000020027,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.1111111111111111,
|
||||
"rbo_topk": 0.04670528456571428,
|
||||
"staged_unique_files_topk": 10,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 3,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "error handling",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "error handling",
|
||||
"latency_ms": 5823.889799982309,
|
||||
"num_results": 6,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\rotational_embedder.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 109.02619361877441,
|
||||
"stage1_fallback_search_ms": 196.54059410095215,
|
||||
"stage2_expand_ms": 2088.4640216827393,
|
||||
"stage3_cluster_ms": 0.00095367431640625,
|
||||
"stage4_rerank_ms": 3328.0465602874756
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 5,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 13,
|
||||
"stage2_unique_paths": 6,
|
||||
"stage2_duplicate_paths": 7,
|
||||
"stage3_clustered": 13,
|
||||
"stage3_strategy": "score",
|
||||
"stage4_reranked": 13
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "error handling",
|
||||
"latency_ms": 3351.872999995947,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.07142857142857142,
|
||||
"rbo_topk": 0.045191399425714276,
|
||||
"staged_unique_files_topk": 5,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 4,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "how to parse json",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "how to parse json",
|
||||
"latency_ms": 5325.160299986601,
|
||||
"num_results": 7,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\indexing\\symbol_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\treesitter_parser.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 216.71128273010254,
|
||||
"stage1_fallback_search_ms": 295.27878761291504,
|
||||
"stage2_expand_ms": 2091.4883613586426,
|
||||
"stage3_cluster_ms": 0.001430511474609375,
|
||||
"stage4_rerank_ms": 2606.9161891937256
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 4,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 9,
|
||||
"stage2_unique_paths": 7,
|
||||
"stage2_duplicate_paths": 2,
|
||||
"stage3_clustered": 9,
|
||||
"stage3_strategy": "score",
|
||||
"stage4_reranked": 9
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "how to parse json",
|
||||
"latency_ms": 2928.889899969101,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.21428571428571427,
|
||||
"rbo_topk": 0.18590219827714285,
|
||||
"staged_unique_files_topk": 7,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 5,
|
||||
"dense_unique_dirs_topk": 4
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -1,467 +0,0 @@
|
||||
{
|
||||
"summary": {
|
||||
"timestamp": "2026-02-11 15:16:08",
|
||||
"source": "codex-lens\\src",
|
||||
"k": 10,
|
||||
"coarse_k": 100,
|
||||
"query_count": 7,
|
||||
"avg_jaccard_topk": 0.11350467619264612,
|
||||
"avg_rbo_topk": 0.09062624799510204,
|
||||
"staged": {
|
||||
"success": 7,
|
||||
"avg_latency_ms": 4507.475014303412
|
||||
},
|
||||
"dense_rerank": {
|
||||
"success": 7,
|
||||
"avg_latency_ms": 2537.8563000304357
|
||||
}
|
||||
},
|
||||
"comparisons": [
|
||||
{
|
||||
"query": "class Config",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "class Config",
|
||||
"latency_ms": 2474.800100028515,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\file_watcher.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 91.76826477050781,
|
||||
"stage1_fallback_search_ms": 162.45269775390625,
|
||||
"stage2_expand_ms": 14.957904815673828,
|
||||
"stage3_cluster_ms": 0.8461475372314453,
|
||||
"stage4_rerank_ms": 2129.7342777252197
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 37,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 86,
|
||||
"stage2_unique_paths": 53,
|
||||
"stage2_duplicate_paths": 33,
|
||||
"stage3_clustered": 20,
|
||||
"stage3_strategy": "dir_rr",
|
||||
"stage4_reranked": 20
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "class Config",
|
||||
"latency_ms": 2425.3046000003815,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.05263157894736842,
|
||||
"rbo_topk": 0.014635885139999999,
|
||||
"staged_unique_files_topk": 10,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 8,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "def search",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "def search",
|
||||
"latency_ms": 5389.070900022984,
|
||||
"num_results": 3,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\entities.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 63.6446475982666,
|
||||
"stage2_expand_ms": 3202.108144760132,
|
||||
"stage3_cluster_ms": 0.0007152557373046875,
|
||||
"stage4_rerank_ms": 2011.8708610534668
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 3,
|
||||
"stage2_expanded": 4,
|
||||
"stage2_unique_paths": 3,
|
||||
"stage2_duplicate_paths": 1,
|
||||
"stage3_clustered": 4,
|
||||
"stage3_strategy": "dir_rr",
|
||||
"stage4_reranked": 4
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "def search",
|
||||
"latency_ms": 2465.9148000478745,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.09090909090909091,
|
||||
"rbo_topk": 0.23541639942571424,
|
||||
"staged_unique_files_topk": 2,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 2,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "LspBridge",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "LspBridge",
|
||||
"latency_ms": 4989.407700002193,
|
||||
"num_results": 7,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\keepalive_bridge.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\__init__.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 88.54341506958008,
|
||||
"stage1_fallback_search_ms": 125.9164810180664,
|
||||
"stage2_expand_ms": 2063.6398792266846,
|
||||
"stage3_cluster_ms": 0.3476142883300781,
|
||||
"stage4_rerank_ms": 2633.7506771087646
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 5,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 31,
|
||||
"stage2_unique_paths": 11,
|
||||
"stage2_duplicate_paths": 20,
|
||||
"stage3_clustered": 20,
|
||||
"stage3_strategy": "dir_rr",
|
||||
"stage4_reranked": 20
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "LspBridge",
|
||||
"latency_ms": 2424.8579000234604,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.06666666666666667,
|
||||
"rbo_topk": 0.045191399425714276,
|
||||
"staged_unique_files_topk": 6,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 2,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "graph expansion",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "graph expansion",
|
||||
"latency_ms": 4771.1614000201225,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_007_add_graph_neighbors.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 61.426401138305664,
|
||||
"stage1_fallback_search_ms": 152.01711654663086,
|
||||
"stage2_expand_ms": 2078.4833431243896,
|
||||
"stage3_cluster_ms": 0.0007152557373046875,
|
||||
"stage4_rerank_ms": 2376.2998580932617
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 11,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 16,
|
||||
"stage2_unique_paths": 13,
|
||||
"stage2_duplicate_paths": 3,
|
||||
"stage3_clustered": 16,
|
||||
"stage3_strategy": "dir_rr",
|
||||
"stage4_reranked": 16
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "graph expansion",
|
||||
"latency_ms": 2418.981700003147,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.1875,
|
||||
"rbo_topk": 0.06134116970571428,
|
||||
"staged_unique_files_topk": 9,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 7,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "clustering strategy",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "clustering strategy",
|
||||
"latency_ms": 4559.269900023937,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\noop_strategy.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\dbscan_strategy.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\base.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\hdbscan_strategy.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\frequency_strategy.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\factory.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\__init__.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 60.93573570251465,
|
||||
"stage1_fallback_search_ms": 141.4163112640381,
|
||||
"stage2_expand_ms": 2032.2721004486084,
|
||||
"stage3_cluster_ms": 0.0007152557373046875,
|
||||
"stage4_rerank_ms": 2217.2317504882812
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 10,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 10,
|
||||
"stage2_unique_paths": 10,
|
||||
"stage2_duplicate_paths": 0,
|
||||
"stage3_clustered": 10,
|
||||
"stage3_strategy": "dir_rr",
|
||||
"stage4_reranked": 10
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "clustering strategy",
|
||||
"latency_ms": 2443.3700000047684,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.1111111111111111,
|
||||
"rbo_topk": 0.04670528456571428,
|
||||
"staged_unique_files_topk": 10,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 3,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "error handling",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "error handling",
|
||||
"latency_ms": 4757.269500017166,
|
||||
"num_results": 6,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\rotational_embedder.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 89.56503868103027,
|
||||
"stage1_fallback_search_ms": 143.58854293823242,
|
||||
"stage2_expand_ms": 2119.623899459839,
|
||||
"stage3_cluster_ms": 0.0007152557373046875,
|
||||
"stage4_rerank_ms": 2303.9650917053223
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 5,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 13,
|
||||
"stage2_unique_paths": 6,
|
||||
"stage2_duplicate_paths": 7,
|
||||
"stage3_clustered": 13,
|
||||
"stage3_strategy": "dir_rr",
|
||||
"stage4_reranked": 13
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "error handling",
|
||||
"latency_ms": 2431.0521000623703,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.07142857142857142,
|
||||
"rbo_topk": 0.045191399425714276,
|
||||
"staged_unique_files_topk": 5,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 4,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "how to parse json",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "how to parse json",
|
||||
"latency_ms": 4611.3456000089645,
|
||||
"num_results": 7,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\indexing\\symbol_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\treesitter_parser.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 74.86128807067871,
|
||||
"stage1_fallback_search_ms": 137.465238571167,
|
||||
"stage2_expand_ms": 2086.426019668579,
|
||||
"stage3_cluster_ms": 0.00095367431640625,
|
||||
"stage4_rerank_ms": 2218.2157039642334
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 4,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 9,
|
||||
"stage2_unique_paths": 7,
|
||||
"stage2_duplicate_paths": 2,
|
||||
"stage3_clustered": 9,
|
||||
"stage3_strategy": "dir_rr",
|
||||
"stage4_reranked": 9
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "how to parse json",
|
||||
"latency_ms": 3155.5130000710487,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.21428571428571427,
|
||||
"rbo_topk": 0.18590219827714285,
|
||||
"staged_unique_files_topk": 7,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 5,
|
||||
"dense_unique_dirs_topk": 4
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -1,465 +0,0 @@
|
||||
{
|
||||
"summary": {
|
||||
"timestamp": "2026-02-11 15:12:41",
|
||||
"source": "codex-lens\\src",
|
||||
"k": 10,
|
||||
"coarse_k": 100,
|
||||
"query_count": 7,
|
||||
"avg_jaccard_topk": 0.13455730777159347,
|
||||
"avg_rbo_topk": 0.10274807844326529,
|
||||
"staged": {
|
||||
"success": 7,
|
||||
"avg_latency_ms": 4532.43382857527
|
||||
},
|
||||
"dense_rerank": {
|
||||
"success": 7,
|
||||
"avg_latency_ms": 2712.3431142909185
|
||||
}
|
||||
},
|
||||
"comparisons": [
|
||||
{
|
||||
"query": "class Config",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "class Config",
|
||||
"latency_ms": 2704.6869000196457,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 56.32758140563965,
|
||||
"stage1_fallback_search_ms": 156.8472385406494,
|
||||
"stage2_expand_ms": 15.436887741088867,
|
||||
"stage3_cluster_ms": 0.04291534423828125,
|
||||
"stage4_rerank_ms": 2388.756513595581
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 37,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 86,
|
||||
"stage2_unique_paths": 53,
|
||||
"stage2_duplicate_paths": 33,
|
||||
"stage3_clustered": 20,
|
||||
"stage3_strategy": "path",
|
||||
"stage4_reranked": 20
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "class Config",
|
||||
"latency_ms": 3257.856599986553,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.2,
|
||||
"rbo_topk": 0.09948869827714285,
|
||||
"staged_unique_files_topk": 8,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 4,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "def search",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "def search",
|
||||
"latency_ms": 4347.2081000208855,
|
||||
"num_results": 3,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\entities.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 65.37723541259766,
|
||||
"stage2_expand_ms": 2145.587682723999,
|
||||
"stage3_cluster_ms": 0.00095367431640625,
|
||||
"stage4_rerank_ms": 2052.9236793518066
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 3,
|
||||
"stage2_expanded": 4,
|
||||
"stage2_unique_paths": 3,
|
||||
"stage2_duplicate_paths": 1,
|
||||
"stage3_clustered": 4,
|
||||
"stage3_strategy": "path",
|
||||
"stage4_reranked": 4
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "def search",
|
||||
"latency_ms": 2642.404200077057,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.09090909090909091,
|
||||
"rbo_topk": 0.23541639942571424,
|
||||
"staged_unique_files_topk": 2,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 2,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "LspBridge",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "LspBridge",
|
||||
"latency_ms": 4627.254400074482,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\keepalive_bridge.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 96.67634963989258,
|
||||
"stage1_fallback_search_ms": 162.25123405456543,
|
||||
"stage2_expand_ms": 2071.5224742889404,
|
||||
"stage3_cluster_ms": 0.018835067749023438,
|
||||
"stage4_rerank_ms": 2211.8191719055176
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 5,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 31,
|
||||
"stage2_unique_paths": 11,
|
||||
"stage2_duplicate_paths": 20,
|
||||
"stage3_clustered": 11,
|
||||
"stage3_strategy": "path",
|
||||
"stage4_reranked": 11
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "LspBridge",
|
||||
"latency_ms": 2479.5284999608994,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.06666666666666667,
|
||||
"rbo_topk": 0.045191399425714276,
|
||||
"staged_unique_files_topk": 6,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 2,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "graph expansion",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "graph expansion",
|
||||
"latency_ms": 4663.639899969101,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_007_add_graph_neighbors.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 82.36384391784668,
|
||||
"stage1_fallback_search_ms": 158.2353115081787,
|
||||
"stage2_expand_ms": 2087.8846645355225,
|
||||
"stage3_cluster_ms": 0.00095367431640625,
|
||||
"stage4_rerank_ms": 2249.4378089904785
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 11,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 16,
|
||||
"stage2_unique_paths": 13,
|
||||
"stage2_duplicate_paths": 3,
|
||||
"stage3_clustered": 16,
|
||||
"stage3_strategy": "path",
|
||||
"stage4_reranked": 16
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "graph expansion",
|
||||
"latency_ms": 2455.024599969387,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.1875,
|
||||
"rbo_topk": 0.06134116970571428,
|
||||
"staged_unique_files_topk": 9,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 7,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "clustering strategy",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "clustering strategy",
|
||||
"latency_ms": 6402.90189999342,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\dbscan_strategy.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\noop_strategy.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\base.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\hdbscan_strategy.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\frequency_strategy.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\factory.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\__init__.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 44.295310974121094,
|
||||
"stage1_fallback_search_ms": 127.30145454406738,
|
||||
"stage2_expand_ms": 2030.930995941162,
|
||||
"stage3_cluster_ms": 0.00095367431640625,
|
||||
"stage4_rerank_ms": 4132.822036743164
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 10,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 10,
|
||||
"stage2_unique_paths": 10,
|
||||
"stage2_duplicate_paths": 0,
|
||||
"stage3_clustered": 10,
|
||||
"stage3_strategy": "path",
|
||||
"stage4_reranked": 10
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "clustering strategy",
|
||||
"latency_ms": 3286.4142000079155,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.1111111111111111,
|
||||
"rbo_topk": 0.04670528456571428,
|
||||
"staged_unique_files_topk": 10,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 3,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "error handling",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "error handling",
|
||||
"latency_ms": 4532.2757999897,
|
||||
"num_results": 6,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\rotational_embedder.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 85.02960205078125,
|
||||
"stage1_fallback_search_ms": 146.46339416503906,
|
||||
"stage2_expand_ms": 2071.5532302856445,
|
||||
"stage3_cluster_ms": 0.00095367431640625,
|
||||
"stage4_rerank_ms": 2140.7644748687744
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 5,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 13,
|
||||
"stage2_unique_paths": 6,
|
||||
"stage2_duplicate_paths": 7,
|
||||
"stage3_clustered": 13,
|
||||
"stage3_strategy": "path",
|
||||
"stage4_reranked": 13
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "error handling",
|
||||
"latency_ms": 2349.7827999591827,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.07142857142857142,
|
||||
"rbo_topk": 0.045191399425714276,
|
||||
"staged_unique_files_topk": 5,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 4,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "how to parse json",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "how to parse json",
|
||||
"latency_ms": 4449.06979995966,
|
||||
"num_results": 7,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\indexing\\symbol_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\treesitter_parser.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 67.15631484985352,
|
||||
"stage1_fallback_search_ms": 148.30541610717773,
|
||||
"stage2_expand_ms": 2069.3678855895996,
|
||||
"stage3_cluster_ms": 0.0007152557373046875,
|
||||
"stage4_rerank_ms": 2097.882032394409
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 4,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 9,
|
||||
"stage2_unique_paths": 7,
|
||||
"stage2_duplicate_paths": 2,
|
||||
"stage3_clustered": 9,
|
||||
"stage3_strategy": "path",
|
||||
"stage4_reranked": 9
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "how to parse json",
|
||||
"latency_ms": 2515.3909000754356,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.21428571428571427,
|
||||
"rbo_topk": 0.18590219827714285,
|
||||
"staged_unique_files_topk": 7,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 5,
|
||||
"dense_unique_dirs_topk": 4
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -1,465 +0,0 @@
|
||||
{
|
||||
"summary": {
|
||||
"timestamp": "2026-02-11 15:14:25",
|
||||
"source": "codex-lens\\src",
|
||||
"k": 10,
|
||||
"coarse_k": 100,
|
||||
"query_count": 7,
|
||||
"avg_jaccard_topk": 0.12384302205730777,
|
||||
"avg_rbo_topk": 0.09816673566816325,
|
||||
"staged": {
|
||||
"success": 7,
|
||||
"avg_latency_ms": 4538.7477714674815
|
||||
},
|
||||
"dense_rerank": {
|
||||
"success": 7,
|
||||
"avg_latency_ms": 2568.1517999768257
|
||||
}
|
||||
},
|
||||
"comparisons": [
|
||||
{
|
||||
"query": "class Config",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "class Config",
|
||||
"latency_ms": 2546.395000040531,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 70.5413818359375,
|
||||
"stage1_fallback_search_ms": 165.39907455444336,
|
||||
"stage2_expand_ms": 15.58542251586914,
|
||||
"stage3_cluster_ms": 0.020265579223632812,
|
||||
"stage4_rerank_ms": 2209.89727973938
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 37,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 86,
|
||||
"stage2_unique_paths": 53,
|
||||
"stage2_duplicate_paths": 33,
|
||||
"stage3_clustered": 20,
|
||||
"stage3_strategy": "score",
|
||||
"stage4_reranked": 20
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "class Config",
|
||||
"latency_ms": 2610.328099966049,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.125,
|
||||
"rbo_topk": 0.06741929885142856,
|
||||
"staged_unique_files_topk": 8,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 5,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "def search",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "def search",
|
||||
"latency_ms": 4569.872200012207,
|
||||
"num_results": 3,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\entities.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 96.31776809692383,
|
||||
"stage2_expand_ms": 2299.86310005188,
|
||||
"stage3_cluster_ms": 0.0007152557373046875,
|
||||
"stage4_rerank_ms": 2094.2182540893555
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 3,
|
||||
"stage2_expanded": 4,
|
||||
"stage2_unique_paths": 3,
|
||||
"stage2_duplicate_paths": 1,
|
||||
"stage3_clustered": 4,
|
||||
"stage3_strategy": "score",
|
||||
"stage4_reranked": 4
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "def search",
|
||||
"latency_ms": 2509.9732999801636,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.09090909090909091,
|
||||
"rbo_topk": 0.23541639942571424,
|
||||
"staged_unique_files_topk": 2,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 2,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "LspBridge",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "LspBridge",
|
||||
"latency_ms": 5064.990800082684,
|
||||
"num_results": 6,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\keepalive_bridge.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\__init__.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 86.1806869506836,
|
||||
"stage1_fallback_search_ms": 150.21824836730957,
|
||||
"stage2_expand_ms": 2080.6803703308105,
|
||||
"stage3_cluster_ms": 0.011682510375976562,
|
||||
"stage4_rerank_ms": 2663.7954711914062
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 5,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 31,
|
||||
"stage2_unique_paths": 11,
|
||||
"stage2_duplicate_paths": 20,
|
||||
"stage3_clustered": 20,
|
||||
"stage3_strategy": "score",
|
||||
"stage4_reranked": 20
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "LspBridge",
|
||||
"latency_ms": 2778.6906000375748,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.06666666666666667,
|
||||
"rbo_topk": 0.045191399425714276,
|
||||
"staged_unique_files_topk": 6,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 2,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "graph expansion",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "graph expansion",
|
||||
"latency_ms": 4816.586899995804,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_007_add_graph_neighbors.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 79.48184013366699,
|
||||
"stage1_fallback_search_ms": 158.03027153015137,
|
||||
"stage2_expand_ms": 2087.271213531494,
|
||||
"stage3_cluster_ms": 0.0007152557373046875,
|
||||
"stage4_rerank_ms": 2410.567283630371
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 11,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 16,
|
||||
"stage2_unique_paths": 13,
|
||||
"stage2_duplicate_paths": 3,
|
||||
"stage3_clustered": 16,
|
||||
"stage3_strategy": "score",
|
||||
"stage4_reranked": 16
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "graph expansion",
|
||||
"latency_ms": 2692.1504999399185,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.1875,
|
||||
"rbo_topk": 0.06134116970571428,
|
||||
"staged_unique_files_topk": 9,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 7,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "clustering strategy",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "clustering strategy",
|
||||
"latency_ms": 4494.9805000424385,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\dbscan_strategy.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\noop_strategy.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\base.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\hdbscan_strategy.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\frequency_strategy.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\factory.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\__init__.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 40.569305419921875,
|
||||
"stage1_fallback_search_ms": 141.06035232543945,
|
||||
"stage2_expand_ms": 2043.9364910125732,
|
||||
"stage3_cluster_ms": 0.00095367431640625,
|
||||
"stage4_rerank_ms": 2198.4200477600098
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 10,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 10,
|
||||
"stage2_unique_paths": 10,
|
||||
"stage2_duplicate_paths": 0,
|
||||
"stage3_clustered": 10,
|
||||
"stage3_strategy": "score",
|
||||
"stage4_reranked": 10
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "clustering strategy",
|
||||
"latency_ms": 2474.2726999521255,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.1111111111111111,
|
||||
"rbo_topk": 0.04670528456571428,
|
||||
"staged_unique_files_topk": 10,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 3,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "error handling",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "error handling",
|
||||
"latency_ms": 5652.523400068283,
|
||||
"num_results": 6,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\rotational_embedder.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 87.34393119812012,
|
||||
"stage1_fallback_search_ms": 149.7325897216797,
|
||||
"stage2_expand_ms": 2072.728157043457,
|
||||
"stage3_cluster_ms": 0.00095367431640625,
|
||||
"stage4_rerank_ms": 3190.687894821167
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 5,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 13,
|
||||
"stage2_unique_paths": 6,
|
||||
"stage2_duplicate_paths": 7,
|
||||
"stage3_clustered": 13,
|
||||
"stage3_strategy": "score",
|
||||
"stage4_reranked": 13
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "error handling",
|
||||
"latency_ms": 2481.709800004959,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.07142857142857142,
|
||||
"rbo_topk": 0.045191399425714276,
|
||||
"staged_unique_files_topk": 5,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 4,
|
||||
"dense_unique_dirs_topk": 4
|
||||
},
|
||||
{
|
||||
"query": "how to parse json",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "how to parse json",
|
||||
"latency_ms": 4625.885600030422,
|
||||
"num_results": 7,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\indexing\\symbol_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\treesitter_parser.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 92.83590316772461,
|
||||
"stage1_fallback_search_ms": 147.12858200073242,
|
||||
"stage2_expand_ms": 2061.2568855285645,
|
||||
"stage3_cluster_ms": 0.0011920928955078125,
|
||||
"stage4_rerank_ms": 2246.800184249878
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 4,
|
||||
"stage1_fallback_used": 1,
|
||||
"stage2_expanded": 9,
|
||||
"stage2_unique_paths": 7,
|
||||
"stage2_duplicate_paths": 2,
|
||||
"stage3_clustered": 9,
|
||||
"stage3_strategy": "score",
|
||||
"stage4_reranked": 9
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "how to parse json",
|
||||
"latency_ms": 2429.9375999569893,
|
||||
"num_results": 10,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.21428571428571427,
|
||||
"rbo_topk": 0.18590219827714285,
|
||||
"staged_unique_files_topk": 7,
|
||||
"dense_unique_files_topk": 10,
|
||||
"staged_unique_dirs_topk": 5,
|
||||
"dense_unique_dirs_topk": 4
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -1,406 +0,0 @@
|
||||
{
|
||||
"storage_analysis": {
|
||||
"tables": {
|
||||
"code_relationships": {
|
||||
"row_count": 0,
|
||||
"columns": [
|
||||
"id",
|
||||
"source_symbol_id",
|
||||
"target_qualified_name",
|
||||
"relationship_type",
|
||||
"source_line",
|
||||
"target_file"
|
||||
]
|
||||
},
|
||||
"embeddings_config": {
|
||||
"row_count": 1,
|
||||
"columns": [
|
||||
"id",
|
||||
"model_profile",
|
||||
"model_name",
|
||||
"embedding_dim",
|
||||
"backend",
|
||||
"created_at",
|
||||
"updated_at"
|
||||
]
|
||||
},
|
||||
"file_keywords": {
|
||||
"row_count": 0,
|
||||
"columns": [
|
||||
"file_id",
|
||||
"keyword_id"
|
||||
]
|
||||
},
|
||||
"files": {
|
||||
"row_count": 0,
|
||||
"columns": [
|
||||
"id",
|
||||
"name",
|
||||
"full_path",
|
||||
"language",
|
||||
"content",
|
||||
"mtime",
|
||||
"line_count"
|
||||
]
|
||||
},
|
||||
"files_fts_exact": {
|
||||
"row_count": 0,
|
||||
"columns": [
|
||||
"name",
|
||||
"full_path",
|
||||
"content"
|
||||
]
|
||||
},
|
||||
"files_fts_exact_config": {
|
||||
"row_count": 1,
|
||||
"columns": [
|
||||
"k",
|
||||
"v"
|
||||
]
|
||||
},
|
||||
"files_fts_exact_data": {
|
||||
"row_count": 2,
|
||||
"columns": [
|
||||
"id",
|
||||
"block"
|
||||
]
|
||||
},
|
||||
"files_fts_exact_docsize": {
|
||||
"row_count": 0,
|
||||
"columns": [
|
||||
"id",
|
||||
"sz"
|
||||
]
|
||||
},
|
||||
"files_fts_exact_idx": {
|
||||
"row_count": 0,
|
||||
"columns": [
|
||||
"segid",
|
||||
"term",
|
||||
"pgno"
|
||||
]
|
||||
},
|
||||
"files_fts_fuzzy": {
|
||||
"row_count": 0,
|
||||
"columns": [
|
||||
"name",
|
||||
"full_path",
|
||||
"content"
|
||||
]
|
||||
},
|
||||
"files_fts_fuzzy_config": {
|
||||
"row_count": 1,
|
||||
"columns": [
|
||||
"k",
|
||||
"v"
|
||||
]
|
||||
},
|
||||
"files_fts_fuzzy_data": {
|
||||
"row_count": 2,
|
||||
"columns": [
|
||||
"id",
|
||||
"block"
|
||||
]
|
||||
},
|
||||
"files_fts_fuzzy_docsize": {
|
||||
"row_count": 0,
|
||||
"columns": [
|
||||
"id",
|
||||
"sz"
|
||||
]
|
||||
},
|
||||
"files_fts_fuzzy_idx": {
|
||||
"row_count": 0,
|
||||
"columns": [
|
||||
"segid",
|
||||
"term",
|
||||
"pgno"
|
||||
]
|
||||
},
|
||||
"graph_neighbors": {
|
||||
"row_count": 0,
|
||||
"columns": [
|
||||
"source_symbol_id",
|
||||
"neighbor_symbol_id",
|
||||
"relationship_depth"
|
||||
]
|
||||
},
|
||||
"keywords": {
|
||||
"row_count": 0,
|
||||
"columns": [
|
||||
"id",
|
||||
"keyword"
|
||||
]
|
||||
},
|
||||
"merkle_hashes": {
|
||||
"row_count": 0,
|
||||
"columns": [
|
||||
"file_id",
|
||||
"sha256",
|
||||
"updated_at"
|
||||
]
|
||||
},
|
||||
"merkle_state": {
|
||||
"row_count": 1,
|
||||
"columns": [
|
||||
"id",
|
||||
"root_hash",
|
||||
"updated_at"
|
||||
]
|
||||
},
|
||||
"semantic_chunks": {
|
||||
"row_count": 0,
|
||||
"columns": [
|
||||
"id",
|
||||
"file_path",
|
||||
"content",
|
||||
"embedding",
|
||||
"metadata",
|
||||
"created_at",
|
||||
"embedding_binary",
|
||||
"embedding_dense"
|
||||
]
|
||||
},
|
||||
"semantic_metadata": {
|
||||
"row_count": 0,
|
||||
"columns": [
|
||||
"id",
|
||||
"file_id",
|
||||
"summary",
|
||||
"purpose",
|
||||
"llm_tool",
|
||||
"generated_at"
|
||||
]
|
||||
},
|
||||
"sqlite_sequence": {
|
||||
"row_count": 0,
|
||||
"columns": [
|
||||
"name",
|
||||
"seq"
|
||||
]
|
||||
},
|
||||
"subdirs": {
|
||||
"row_count": 2,
|
||||
"columns": [
|
||||
"id",
|
||||
"name",
|
||||
"index_path",
|
||||
"files_count",
|
||||
"last_updated"
|
||||
]
|
||||
},
|
||||
"symbols": {
|
||||
"row_count": 0,
|
||||
"columns": [
|
||||
"id",
|
||||
"file_id",
|
||||
"name",
|
||||
"kind",
|
||||
"start_line",
|
||||
"end_line"
|
||||
]
|
||||
}
|
||||
},
|
||||
"conflicts": [],
|
||||
"recommendations": [
|
||||
"Found 10 FTS tables: ['files_fts_exact', 'files_fts_exact_config', 'files_fts_exact_data', 'files_fts_exact_docsize', 'files_fts_exact_idx', 'files_fts_fuzzy', 'files_fts_fuzzy_config', 'files_fts_fuzzy_data', 'files_fts_fuzzy_docsize', 'files_fts_fuzzy_idx']. Dual FTS (exact + fuzzy) is properly configured."
|
||||
]
|
||||
},
|
||||
"contribution_analysis": {
|
||||
"per_query": [
|
||||
{
|
||||
"query": "binary quantization",
|
||||
"methods": {
|
||||
"fts_exact": {
|
||||
"error": "'obj' object has no attribute 'symbol_boost_factor'",
|
||||
"count": 0
|
||||
},
|
||||
"fts_fuzzy": {
|
||||
"error": "'obj' object has no attribute 'symbol_boost_factor'",
|
||||
"count": 0
|
||||
},
|
||||
"vector": {
|
||||
"error": "'obj' object has no attribute 'symbol_boost_factor'",
|
||||
"count": 0
|
||||
},
|
||||
"splade": {
|
||||
"error": "'obj' object has no attribute 'symbol_boost_factor'",
|
||||
"count": 0
|
||||
}
|
||||
},
|
||||
"fusion_analysis": {},
|
||||
"overlaps": {}
|
||||
},
|
||||
{
|
||||
"query": "hamming distance search",
|
||||
"methods": {
|
||||
"fts_exact": {
|
||||
"error": "'obj' object has no attribute 'symbol_boost_factor'",
|
||||
"count": 0
|
||||
},
|
||||
"fts_fuzzy": {
|
||||
"error": "'obj' object has no attribute 'symbol_boost_factor'",
|
||||
"count": 0
|
||||
},
|
||||
"vector": {
|
||||
"error": "'obj' object has no attribute 'symbol_boost_factor'",
|
||||
"count": 0
|
||||
},
|
||||
"splade": {
|
||||
"error": "'obj' object has no attribute 'symbol_boost_factor'",
|
||||
"count": 0
|
||||
}
|
||||
},
|
||||
"fusion_analysis": {},
|
||||
"overlaps": {}
|
||||
},
|
||||
{
|
||||
"query": "embeddings generation",
|
||||
"methods": {
|
||||
"fts_exact": {
|
||||
"error": "'obj' object has no attribute 'symbol_boost_factor'",
|
||||
"count": 0
|
||||
},
|
||||
"fts_fuzzy": {
|
||||
"error": "'obj' object has no attribute 'symbol_boost_factor'",
|
||||
"count": 0
|
||||
},
|
||||
"vector": {
|
||||
"error": "'obj' object has no attribute 'symbol_boost_factor'",
|
||||
"count": 0
|
||||
},
|
||||
"splade": {
|
||||
"error": "'obj' object has no attribute 'symbol_boost_factor'",
|
||||
"count": 0
|
||||
}
|
||||
},
|
||||
"fusion_analysis": {},
|
||||
"overlaps": {}
|
||||
},
|
||||
{
|
||||
"query": "reranking algorithm",
|
||||
"methods": {
|
||||
"fts_exact": {
|
||||
"error": "'obj' object has no attribute 'symbol_boost_factor'",
|
||||
"count": 0
|
||||
},
|
||||
"fts_fuzzy": {
|
||||
"error": "'obj' object has no attribute 'symbol_boost_factor'",
|
||||
"count": 0
|
||||
},
|
||||
"vector": {
|
||||
"error": "'obj' object has no attribute 'symbol_boost_factor'",
|
||||
"count": 0
|
||||
},
|
||||
"splade": {
|
||||
"error": "'obj' object has no attribute 'symbol_boost_factor'",
|
||||
"count": 0
|
||||
}
|
||||
},
|
||||
"fusion_analysis": {},
|
||||
"overlaps": {}
|
||||
},
|
||||
{
|
||||
"query": "database connection handling",
|
||||
"methods": {
|
||||
"fts_exact": {
|
||||
"error": "'obj' object has no attribute 'symbol_boost_factor'",
|
||||
"count": 0
|
||||
},
|
||||
"fts_fuzzy": {
|
||||
"error": "'obj' object has no attribute 'symbol_boost_factor'",
|
||||
"count": 0
|
||||
},
|
||||
"vector": {
|
||||
"error": "'obj' object has no attribute 'symbol_boost_factor'",
|
||||
"count": 0
|
||||
},
|
||||
"splade": {
|
||||
"error": "'obj' object has no attribute 'symbol_boost_factor'",
|
||||
"count": 0
|
||||
}
|
||||
},
|
||||
"fusion_analysis": {},
|
||||
"overlaps": {}
|
||||
}
|
||||
],
|
||||
"summary": {
|
||||
"fts_exact": {
|
||||
"avg_count": 0.0,
|
||||
"avg_latency_ms": 0
|
||||
},
|
||||
"fts_fuzzy": {
|
||||
"avg_count": 0.0,
|
||||
"avg_latency_ms": 0
|
||||
},
|
||||
"vector": {
|
||||
"avg_count": 0.0,
|
||||
"avg_latency_ms": 0
|
||||
},
|
||||
"splade": {
|
||||
"avg_count": 0.0,
|
||||
"avg_latency_ms": 0
|
||||
}
|
||||
}
|
||||
},
|
||||
"fusion_experiment": {
|
||||
"per_query": [
|
||||
{
|
||||
"query": "binary quantization",
|
||||
"strategies": {
|
||||
"standard_hybrid": {
|
||||
"error": "'obj' object has no attribute 'symbol_boost_factor'"
|
||||
},
|
||||
"fts_rerank_fusion": {
|
||||
"error": "'obj' object has no attribute 'symbol_boost_factor'"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"query": "hamming distance search",
|
||||
"strategies": {
|
||||
"standard_hybrid": {
|
||||
"error": "'obj' object has no attribute 'symbol_boost_factor'"
|
||||
},
|
||||
"fts_rerank_fusion": {
|
||||
"error": "'obj' object has no attribute 'symbol_boost_factor'"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"query": "embeddings generation",
|
||||
"strategies": {
|
||||
"standard_hybrid": {
|
||||
"error": "'obj' object has no attribute 'symbol_boost_factor'"
|
||||
},
|
||||
"fts_rerank_fusion": {
|
||||
"error": "'obj' object has no attribute 'symbol_boost_factor'"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"query": "reranking algorithm",
|
||||
"strategies": {
|
||||
"standard_hybrid": {
|
||||
"error": "'obj' object has no attribute 'symbol_boost_factor'"
|
||||
},
|
||||
"fts_rerank_fusion": {
|
||||
"error": "'obj' object has no attribute 'symbol_boost_factor'"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"query": "database connection handling",
|
||||
"strategies": {
|
||||
"standard_hybrid": {
|
||||
"error": "'obj' object has no attribute 'symbol_boost_factor'"
|
||||
},
|
||||
"fts_rerank_fusion": {
|
||||
"error": "'obj' object has no attribute 'symbol_boost_factor'"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"summary": {}
|
||||
}
|
||||
}
|
||||
@@ -1,73 +0,0 @@
|
||||
{
|
||||
"summary": {
|
||||
"timestamp": "2026-02-08 23:48:26",
|
||||
"source": "src",
|
||||
"k": 5,
|
||||
"coarse_k": 50,
|
||||
"query_count": 1,
|
||||
"avg_jaccard_topk": 0.0,
|
||||
"avg_rbo_topk": 0.0,
|
||||
"staged": {
|
||||
"success": 1,
|
||||
"avg_latency_ms": 30093.97499999404
|
||||
},
|
||||
"dense_rerank": {
|
||||
"success": 1,
|
||||
"avg_latency_ms": 331.4424999952316
|
||||
}
|
||||
},
|
||||
"comparisons": [
|
||||
{
|
||||
"query": "class Config",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "class Config",
|
||||
"latency_ms": 30093.97499999404,
|
||||
"num_results": 5,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\__init__.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 6421.706914901733,
|
||||
"stage2_expand_ms": 17591.988563537598,
|
||||
"stage3_cluster_ms": 3700.4549503326416,
|
||||
"stage4_rerank_ms": 2340.064525604248
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 50,
|
||||
"stage2_expanded": 99,
|
||||
"stage3_clustered": 10,
|
||||
"stage4_reranked": 10
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "class Config",
|
||||
"latency_ms": 331.4424999952316,
|
||||
"num_results": 5,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\splade_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.0,
|
||||
"rbo_topk": 0.0,
|
||||
"staged_unique_files_topk": 5,
|
||||
"dense_unique_files_topk": 5,
|
||||
"staged_unique_dirs_topk": 4,
|
||||
"dense_unique_dirs_topk": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -1,177 +0,0 @@
|
||||
{
|
||||
"summary": {
|
||||
"timestamp": "2026-02-08 23:58:56",
|
||||
"source": "src",
|
||||
"k": 5,
|
||||
"coarse_k": 50,
|
||||
"query_count": 3,
|
||||
"avg_jaccard_topk": 0.11574074074074074,
|
||||
"avg_rbo_topk": 0.14601366666666662,
|
||||
"staged": {
|
||||
"success": 3,
|
||||
"avg_latency_ms": 27868.044033328693
|
||||
},
|
||||
"dense_rerank": {
|
||||
"success": 3,
|
||||
"avg_latency_ms": 1339.25289999942
|
||||
}
|
||||
},
|
||||
"comparisons": [
|
||||
{
|
||||
"query": "class Config",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "class Config",
|
||||
"latency_ms": 33643.06179998815,
|
||||
"num_results": 5,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\incremental_indexer.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 6201.4524936676025,
|
||||
"stage2_expand_ms": 17306.61702156067,
|
||||
"stage3_cluster_ms": 6829.557418823242,
|
||||
"stage4_rerank_ms": 3267.071485519409
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 50,
|
||||
"stage2_expanded": 99,
|
||||
"stage3_clustered": 10,
|
||||
"stage4_reranked": 10
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "class Config",
|
||||
"latency_ms": 1520.9955999851227,
|
||||
"num_results": 5,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\splade_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.1111111111111111,
|
||||
"rbo_topk": 0.031347,
|
||||
"staged_unique_files_topk": 5,
|
||||
"dense_unique_files_topk": 5,
|
||||
"staged_unique_dirs_topk": 5,
|
||||
"dense_unique_dirs_topk": 1
|
||||
},
|
||||
{
|
||||
"query": "def search",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "def search",
|
||||
"latency_ms": 26400.58900000155,
|
||||
"num_results": 5,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 404.60920333862305,
|
||||
"stage2_expand_ms": 20036.258697509766,
|
||||
"stage3_cluster_ms": 4919.439315795898,
|
||||
"stage4_rerank_ms": 1001.8632411956787
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 50,
|
||||
"stage2_expanded": 51,
|
||||
"stage3_clustered": 10,
|
||||
"stage4_reranked": 10
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "def search",
|
||||
"latency_ms": 1264.3862999975681,
|
||||
"num_results": 5,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.125,
|
||||
"rbo_topk": 0.20334699999999994,
|
||||
"staged_unique_files_topk": 4,
|
||||
"dense_unique_files_topk": 5,
|
||||
"staged_unique_dirs_topk": 3,
|
||||
"dense_unique_dirs_topk": 2
|
||||
},
|
||||
{
|
||||
"query": "LspBridge",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "LspBridge",
|
||||
"latency_ms": 23560.481299996376,
|
||||
"num_results": 5,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 385.28990745544434,
|
||||
"stage2_expand_ms": 17787.648677825928,
|
||||
"stage3_cluster_ms": 4374.642372131348,
|
||||
"stage4_rerank_ms": 974.8115539550781
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 50,
|
||||
"stage2_expanded": 50,
|
||||
"stage3_clustered": 10,
|
||||
"stage4_reranked": 10
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "LspBridge",
|
||||
"latency_ms": 1232.3768000155687,
|
||||
"num_results": 5,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.1111111111111111,
|
||||
"rbo_topk": 0.20334699999999994,
|
||||
"staged_unique_files_topk": 5,
|
||||
"dense_unique_files_topk": 5,
|
||||
"staged_unique_dirs_topk": 4,
|
||||
"dense_unique_dirs_topk": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -1,176 +0,0 @@
|
||||
{
|
||||
"summary": {
|
||||
"timestamp": "2026-02-09 00:08:47",
|
||||
"source": "src",
|
||||
"k": 5,
|
||||
"coarse_k": 50,
|
||||
"query_count": 3,
|
||||
"avg_jaccard_topk": 0.11574074074074074,
|
||||
"avg_rbo_topk": 0.14601366666666662,
|
||||
"staged": {
|
||||
"success": 3,
|
||||
"avg_latency_ms": 31720.555866663653
|
||||
},
|
||||
"dense_rerank": {
|
||||
"success": 3,
|
||||
"avg_latency_ms": 1401.2113333245118
|
||||
}
|
||||
},
|
||||
"comparisons": [
|
||||
{
|
||||
"query": "class Config",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "class Config",
|
||||
"latency_ms": 40162.88519999385,
|
||||
"num_results": 5,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\incremental_indexer.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 6091.366767883301,
|
||||
"stage2_expand_ms": 17540.942907333374,
|
||||
"stage3_cluster_ms": 13169.558048248291,
|
||||
"stage4_rerank_ms": 3317.5392150878906
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 50,
|
||||
"stage2_expanded": 99,
|
||||
"stage3_clustered": 10,
|
||||
"stage4_reranked": 10
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "class Config",
|
||||
"latency_ms": 1571.1398999989033,
|
||||
"num_results": 5,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\splade_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.1111111111111111,
|
||||
"rbo_topk": 0.031347,
|
||||
"staged_unique_files_topk": 5,
|
||||
"dense_unique_files_topk": 5,
|
||||
"staged_unique_dirs_topk": 5,
|
||||
"dense_unique_dirs_topk": 1
|
||||
},
|
||||
{
|
||||
"query": "def search",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "def search",
|
||||
"latency_ms": 31623.380899995565,
|
||||
"num_results": 5,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 400.84290504455566,
|
||||
"stage2_expand_ms": 20529.58631515503,
|
||||
"stage3_cluster_ms": 9625.348806381226,
|
||||
"stage4_rerank_ms": 1027.686357498169
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 50,
|
||||
"stage2_expanded": 51,
|
||||
"stage3_clustered": 10,
|
||||
"stage4_reranked": 10
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "def search",
|
||||
"latency_ms": 1376.3304999768734,
|
||||
"num_results": 5,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.125,
|
||||
"rbo_topk": 0.20334699999999994,
|
||||
"staged_unique_files_topk": 4,
|
||||
"dense_unique_files_topk": 5,
|
||||
"staged_unique_dirs_topk": 3,
|
||||
"dense_unique_dirs_topk": 2
|
||||
},
|
||||
{
|
||||
"query": "LspBridge",
|
||||
"staged": {
|
||||
"strategy": "staged",
|
||||
"query": "LspBridge",
|
||||
"latency_ms": 23375.40150000155,
|
||||
"num_results": 5,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py"
|
||||
],
|
||||
"stage_stats": {
|
||||
"stage_times": {
|
||||
"stage1_binary_ms": 392.41671562194824,
|
||||
"stage2_expand_ms": 17760.897397994995,
|
||||
"stage3_cluster_ms": 4194.235563278198,
|
||||
"stage4_rerank_ms": 990.307092666626
|
||||
},
|
||||
"stage_counts": {
|
||||
"stage1_candidates": 50,
|
||||
"stage2_expanded": 50,
|
||||
"stage3_clustered": 10,
|
||||
"stage4_reranked": 10
|
||||
}
|
||||
},
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy": "dense_rerank",
|
||||
"query": "LspBridge",
|
||||
"latency_ms": 1256.1635999977589,
|
||||
"num_results": 5,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py"
|
||||
],
|
||||
"stage_stats": null,
|
||||
"error": null
|
||||
},
|
||||
"jaccard_topk": 0.1111111111111111,
|
||||
"rbo_topk": 0.20334699999999994,
|
||||
"staged_unique_files_topk": 5,
|
||||
"dense_unique_files_topk": 5,
|
||||
"staged_unique_dirs_topk": 4,
|
||||
"dense_unique_dirs_topk": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -1,465 +0,0 @@
|
||||
"""
|
||||
CoIR Benchmark Evaluation Report Generator
|
||||
|
||||
Compares SPLADE with mainstream code retrieval models on CoIR benchmark tasks.
|
||||
Generates comprehensive performance analysis report.
|
||||
"""
|
||||
import sys
|
||||
import time
|
||||
import json
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from typing import Dict, List, Tuple
|
||||
import numpy as np
|
||||
|
||||
sys.path.insert(0, 'src')
|
||||
|
||||
# =============================================================================
|
||||
# REFERENCE: Published CoIR Benchmark Scores (NDCG@10)
|
||||
# Source: CoIR Paper (ACL 2025) - https://arxiv.org/abs/2407.02883
|
||||
# =============================================================================
|
||||
|
||||
# Published NDCG@10 scores (in percent) per model and CoIR dataset,
# transcribed from the CoIR paper (ACL 2025, arXiv:2407.02883).
# Used only for the "reference" section of the generated report; the
# local evaluation below never reads these values.
COIR_REFERENCE_SCORES = {
    # Model: {dataset: NDCG@10 score}
    "Voyage-Code-002": {
        "APPS": 26.52, "CosQA": 29.79, "Text2SQL": 69.26, "CodeSearchNet": 81.79,
        "CCR": 73.45, "Contest-DL": 72.77, "StackOverflow": 27.28,
        "FB-ST": 87.68, "FB-MT": 65.35, "Average": 56.26
    },
    "E5-Mistral-7B": {
        "APPS": 21.33, "CosQA": 31.27, "Text2SQL": 65.98, "CodeSearchNet": 54.25,
        "CCR": 65.27, "Contest-DL": 82.55, "StackOverflow": 33.24,
        "FB-ST": 91.54, "FB-MT": 72.71, "Average": 55.18
    },
    "E5-Base": {
        "APPS": 11.52, "CosQA": 32.59, "Text2SQL": 52.31, "CodeSearchNet": 67.99,
        "CCR": 56.87, "Contest-DL": 62.50, "StackOverflow": 21.87,
        "FB-ST": 86.86, "FB-MT": 74.52, "Average": 50.90
    },
    "OpenAI-Ada-002": {
        "APPS": 8.70, "CosQA": 28.88, "Text2SQL": 58.32, "CodeSearchNet": 74.21,
        "CCR": 69.13, "Contest-DL": 53.34, "StackOverflow": 26.04,
        "FB-ST": 72.40, "FB-MT": 47.12, "Average": 45.59
    },
    "BGE-Base": {
        "APPS": 4.05, "CosQA": 32.76, "Text2SQL": 45.59, "CodeSearchNet": 69.60,
        "CCR": 45.56, "Contest-DL": 38.50, "StackOverflow": 21.71,
        "FB-ST": 73.55, "FB-MT": 64.99, "Average": 42.77
    },
    "BGE-M3": {
        "APPS": 7.37, "CosQA": 22.73, "Text2SQL": 48.76, "CodeSearchNet": 43.23,
        "CCR": 47.55, "Contest-DL": 47.86, "StackOverflow": 31.16,
        "FB-ST": 61.04, "FB-MT": 49.94, "Average": 39.31
    },
    "UniXcoder": {
        "APPS": 1.36, "CosQA": 25.14, "Text2SQL": 50.45, "CodeSearchNet": 60.20,
        "CCR": 58.36, "Contest-DL": 41.82, "StackOverflow": 31.03,
        "FB-ST": 44.67, "FB-MT": 36.02, "Average": 37.33
    },
    "GTE-Base": {
        "APPS": 3.24, "CosQA": 30.24, "Text2SQL": 46.19, "CodeSearchNet": 43.35,
        "CCR": 35.50, "Contest-DL": 33.81, "StackOverflow": 28.80,
        "FB-ST": 62.71, "FB-MT": 55.19, "Average": 36.75
    },
    "Contriever": {
        "APPS": 5.14, "CosQA": 14.21, "Text2SQL": 45.46, "CodeSearchNet": 34.72,
        "CCR": 35.74, "Contest-DL": 44.16, "StackOverflow": 24.21,
        "FB-ST": 66.05, "FB-MT": 55.11, "Average": 36.40
    },
}

# Recent models (2025)
# Partial scores only; "note" is free-form commentary shown verbatim in
# the report's "Recent Top Performers" table.
RECENT_MODELS = {
    "Voyage-Code-3": {"Average": 62.5, "note": "13.8% better than OpenAI-v3-large"},
    "SFR-Embedding-Code-7B": {"Average": 67.4, "note": "#1 on CoIR (Feb 2025)"},
    "Jina-Code-v2": {"CosQA": 41.0, "note": "Strong on CosQA"},
    "CodeSage-Large": {"Average": 53.5, "note": "Specialized code model"},
}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# TEST DATA: Synthetic CoIR-like datasets for local evaluation
|
||||
# =============================================================================
|
||||
|
||||
def create_test_datasets():
    """Create synthetic test datasets mimicking CoIR task types.

    Returns:
        A list of three dataset dicts (Text-to-Code, Code-to-Code,
        Text2SQL). Each dict has "name", "description", a "corpus" of
        {"id", "text"} documents, and "queries" of
        {"id", "text", "relevant"} where "relevant" lists gold corpus ids.
    """

    # Text-to-Code (like CosQA, CodeSearchNet)
    text_to_code = {
        "name": "Text-to-Code",
        "description": "Natural language queries to code snippets",
        "corpus": [
            {"id": "c1", "text": "def authenticate_user(username: str, password: str) -> bool:\n user = db.get_user(username)\n if user and verify_hash(password, user.password_hash):\n return True\n return False"},
            {"id": "c2", "text": "async function fetchUserData(userId) {\n const response = await fetch(`/api/users/${userId}`);\n if (!response.ok) throw new Error('User not found');\n return response.json();\n}"},
            {"id": "c3", "text": "def calculate_statistics(data: List[float]) -> Dict[str, float]:\n return {\n 'mean': np.mean(data),\n 'std': np.std(data),\n 'median': np.median(data)\n }"},
            {"id": "c4", "text": "SELECT u.id, u.name, u.email, COUNT(o.id) as order_count\nFROM users u LEFT JOIN orders o ON u.id = o.user_id\nWHERE u.status = 'active'\nGROUP BY u.id, u.name, u.email"},
            {"id": "c5", "text": "def merge_sort(arr: List[int]) -> List[int]:\n if len(arr) <= 1:\n return arr\n mid = len(arr) // 2\n left = merge_sort(arr[:mid])\n right = merge_sort(arr[mid:])\n return merge(left, right)"},
            {"id": "c6", "text": "app.post('/api/auth/login', async (req, res) => {\n const { email, password } = req.body;\n const user = await User.findByEmail(email);\n if (!user || !await bcrypt.compare(password, user.password)) {\n return res.status(401).json({ error: 'Invalid credentials' });\n }\n const token = jwt.sign({ userId: user.id }, process.env.JWT_SECRET);\n res.json({ token });\n});"},
            {"id": "c7", "text": "CREATE TABLE products (\n id SERIAL PRIMARY KEY,\n name VARCHAR(255) NOT NULL,\n price DECIMAL(10, 2) NOT NULL,\n category_id INTEGER REFERENCES categories(id),\n created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP\n);"},
            {"id": "c8", "text": "def read_json_file(filepath: str) -> Dict:\n with open(filepath, 'r', encoding='utf-8') as f:\n return json.load(f)"},
            {"id": "c9", "text": "class UserRepository:\n def __init__(self, session):\n self.session = session\n \n def find_by_email(self, email: str) -> Optional[User]:\n return self.session.query(User).filter(User.email == email).first()"},
            {"id": "c10", "text": "try:\n result = await process_data(input_data)\nexcept ValidationError as e:\n logger.error(f'Validation failed: {e}')\n raise HTTPException(status_code=400, detail=str(e))\nexcept DatabaseError as e:\n logger.critical(f'Database error: {e}')\n raise HTTPException(status_code=500, detail='Internal server error')"},
        ],
        "queries": [
            {"id": "q1", "text": "function to verify user password and authenticate", "relevant": ["c1", "c6"]},
            {"id": "q2", "text": "async http request to fetch user data", "relevant": ["c2"]},
            {"id": "q3", "text": "calculate mean median standard deviation statistics", "relevant": ["c3"]},
            {"id": "q4", "text": "SQL query join users and orders count", "relevant": ["c4", "c7"]},
            {"id": "q5", "text": "recursive sorting algorithm implementation", "relevant": ["c5"]},
            {"id": "q6", "text": "REST API login endpoint with JWT token", "relevant": ["c6", "c1"]},
            {"id": "q7", "text": "create database table with foreign key", "relevant": ["c7"]},
            {"id": "q8", "text": "read and parse JSON file python", "relevant": ["c8"]},
            {"id": "q9", "text": "repository pattern find user by email", "relevant": ["c9", "c1"]},
            {"id": "q10", "text": "exception handling with logging", "relevant": ["c10"]},
        ]
    }

    # Code-to-Code (like CCR): queries are themselves code snippets and
    # cross-language equivalents count as relevant.
    code_to_code = {
        "name": "Code-to-Code",
        "description": "Find similar code implementations",
        "corpus": [
            {"id": "c1", "text": "def add(a, b): return a + b"},
            {"id": "c2", "text": "function sum(x, y) { return x + y; }"},
            {"id": "c3", "text": "func add(a int, b int) int { return a + b }"},
            {"id": "c4", "text": "def subtract(a, b): return a - b"},
            {"id": "c5", "text": "def multiply(a, b): return a * b"},
            {"id": "c6", "text": "const add = (a, b) => a + b;"},
            {"id": "c7", "text": "fn add(a: i32, b: i32) -> i32 { a + b }"},
            {"id": "c8", "text": "public int add(int a, int b) { return a + b; }"},
        ],
        "queries": [
            {"id": "q1", "text": "def add(a, b): return a + b", "relevant": ["c1", "c2", "c3", "c6", "c7", "c8"]},
            {"id": "q2", "text": "def subtract(x, y): return x - y", "relevant": ["c4"]},
            {"id": "q3", "text": "def mult(x, y): return x * y", "relevant": ["c5"]},
        ]
    }

    # Text2SQL: natural-language intents mapped to SQL statements.
    text2sql = {
        "name": "Text2SQL",
        "description": "Natural language to SQL queries",
        "corpus": [
            {"id": "c1", "text": "SELECT * FROM users WHERE active = 1"},
            {"id": "c2", "text": "SELECT COUNT(*) FROM orders WHERE status = 'pending'"},
            {"id": "c3", "text": "SELECT u.name, SUM(o.total) FROM users u JOIN orders o ON u.id = o.user_id GROUP BY u.name"},
            {"id": "c4", "text": "UPDATE products SET price = price * 1.1 WHERE category = 'electronics'"},
            {"id": "c5", "text": "DELETE FROM sessions WHERE expires_at < NOW()"},
            {"id": "c6", "text": "INSERT INTO users (name, email) VALUES ('John', 'john@example.com')"},
        ],
        "queries": [
            {"id": "q1", "text": "get all active users", "relevant": ["c1"]},
            {"id": "q2", "text": "count pending orders", "relevant": ["c2"]},
            {"id": "q3", "text": "total order amount by user", "relevant": ["c3"]},
            {"id": "q4", "text": "increase electronics prices by 10%", "relevant": ["c4"]},
            {"id": "q5", "text": "remove expired sessions", "relevant": ["c5"]},
            {"id": "q6", "text": "add new user", "relevant": ["c6"]},
        ]
    }

    return [text_to_code, code_to_code, text2sql]
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# EVALUATION FUNCTIONS
|
||||
# =============================================================================
|
||||
|
||||
def ndcg_at_k(ranked_list: List[str], relevant: List[str], k: int = 10) -> float:
    """Normalized Discounted Cumulative Gain at cutoff *k* (binary relevance).

    Each relevant document at zero-based position ``pos`` contributes
    ``1 / log2(pos + 2)``; the total is normalized by the ideal DCG of a
    perfect ranking. Returns 0.0 when there are no relevant documents.
    """
    relevant_set = set(relevant)
    dcg = sum(
        1.0 / np.log2(pos + 2)
        for pos, doc_id in enumerate(ranked_list[:k])
        if doc_id in relevant_set
    )

    # Ideal DCG: every relevant doc packed into the top positions.
    idcg = sum(1.0 / np.log2(pos + 2) for pos in range(min(len(relevant), k)))

    return dcg / idcg if idcg > 0 else 0.0
|
||||
|
||||
|
||||
def precision_at_k(ranked_list: List[str], relevant: List[str], k: int = 10) -> float:
    """Fraction of the top-k slots occupied by relevant documents.

    Divides by *k* itself rather than by the number of retrieved items,
    so result lists shorter than k are penalized accordingly.
    """
    hits = set(ranked_list[:k]) & set(relevant)
    return len(hits) / k
|
||||
|
||||
|
||||
def recall_at_k(ranked_list: List[str], relevant: List[str], k: int = 10) -> float:
    """Fraction of the relevant documents that appear in the top-k results.

    Returns 0.0 when *relevant* is empty (no gold documents to find).
    """
    gold = set(relevant)
    if not gold:
        return 0.0
    return len(gold.intersection(ranked_list[:k])) / len(gold)
|
||||
|
||||
|
||||
def mrr(ranked_list: List[str], relevant: List[str]) -> float:
    """Reciprocal rank of the first relevant document, or 0.0 if none.

    Despite the name ("Mean Reciprocal Rank"), this scores a single
    ranking; the mean over queries is taken by the caller.
    """
    gold = set(relevant)
    return next(
        (1.0 / (pos + 1) for pos, doc_id in enumerate(ranked_list) if doc_id in gold),
        0.0,
    )
|
||||
|
||||
|
||||
def evaluate_model(model_name: str, encode_fn, datasets: List[Dict]) -> Dict:
    """Evaluate one encoder on every dataset and report mean metrics.

    Args:
        model_name: Display label for the model (not used in computation).
        encode_fn: Callable mapping a list of texts to embeddings — a 2-D
            array for dense models, otherwise a sequence of vectors.
        datasets: Datasets shaped like create_test_datasets() output.

    Returns:
        Mapping of dataset name to metric dict (values in percent), plus
        an "Average" entry holding the mean NDCG@10 across datasets.
    """
    results: Dict = {}

    for dataset in datasets:
        doc_ids = [doc["id"] for doc in dataset["corpus"]]
        doc_embs = encode_fn([doc["text"] for doc in dataset["corpus"]])

        per_query = {"ndcg@10": [], "precision@10": [], "recall@10": [], "mrr": []}

        for query in dataset["queries"]:
            q_emb = encode_fn([query["text"]])[0]

            # Dense embeddings (2-D array) -> cosine similarity;
            # anything else (e.g. list of sparse-expanded vectors) -> dot product.
            if hasattr(doc_embs, 'shape') and len(doc_embs.shape) == 2:
                q_unit = q_emb / (np.linalg.norm(q_emb) + 1e-8)
                d_unit = doc_embs / (np.linalg.norm(doc_embs, axis=1, keepdims=True) + 1e-8)
                scores = np.dot(d_unit, q_unit)
            else:
                scores = np.array([np.dot(d, q_emb) for d in doc_embs])

            # Highest score first; np.argsort is ascending, so reverse.
            order = np.argsort(scores)[::-1]
            ranked_ids = [doc_ids[i] for i in order]
            gold = query["relevant"]

            per_query["ndcg@10"].append(ndcg_at_k(ranked_ids, gold, 10))
            per_query["precision@10"].append(precision_at_k(ranked_ids, gold, 10))
            per_query["recall@10"].append(recall_at_k(ranked_ids, gold, 10))
            per_query["mrr"].append(mrr(ranked_ids, gold))

        # Convert per-query fractions to mean percentages.
        results[dataset["name"]] = {
            metric: np.mean(values) * 100 for metric, values in per_query.items()
        }

    results["Average"] = {
        "ndcg@10": np.mean([results[d["name"]]["ndcg@10"] for d in datasets]),
        "note": "Average across all datasets",
    }

    return results
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# MODEL IMPLEMENTATIONS
|
||||
# =============================================================================
|
||||
|
||||
def get_splade_encoder():
    """Build an encode(texts) function backed by the project SPLADE encoder.

    Returns a callable mapping a list of strings to a dense float32 array
    of shape (len(texts), vocab_size): each sparse SPLADE vector is
    scattered into full vocabulary space so scores are comparable with
    dense-embedding models in evaluate_model().
    """
    from codexlens.semantic.splade_encoder import get_splade_encoder as _get_splade
    encoder = _get_splade()

    def encode(texts):
        # Batch path for multiple texts; single-text path wraps encode_text
        # so both branches yield a list of sparse vectors (token-id -> weight,
        # as consumed by the scatter loop below).
        # NOTE(review): an empty `texts` list would raise IndexError on
        # texts[0]; callers here always pass at least one text.
        sparse_vecs = encoder.encode_batch(texts) if len(texts) > 1 else [encoder.encode_text(texts[0])]
        # Convert to dense for comparison
        vocab_size = encoder.vocab_size
        dense = np.zeros((len(sparse_vecs), vocab_size), dtype=np.float32)
        for i, sv in enumerate(sparse_vecs):
            for tid, w in sv.items():
                dense[i, tid] = w
        return dense

    return encode
|
||||
|
||||
|
||||
def get_dense_encoder(model_name: str = "all-MiniLM-L6-v2"):
    """Return an encode(texts) closure backed by a SentenceTransformer.

    The model is loaded once at closure-creation time; the returned
    callable maps a list of strings to a 2-D embedding array with the
    progress bar suppressed.
    """
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer(model_name)

    def encode(texts):
        embeddings = model.encode(texts, show_progress_bar=False)
        return embeddings

    return encode
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# REPORT GENERATION
|
||||
# =============================================================================
|
||||
|
||||
def generate_report(local_results: Dict, output_path: str = None):
    """Generate comprehensive benchmark report.

    Args:
        local_results: Mapping of model name -> evaluate_model() output.
        output_path: Optional file path; when given, the report is also
            written there (UTF-8).

    Returns:
        The full report as a single markdown-formatted string.
    """

    report = []
    report.append("=" * 80)
    report.append("CODE RETRIEVAL BENCHMARK REPORT")
    report.append(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    report.append("=" * 80)

    # Section 1: Reference Benchmark Scores
    report.append("\n## 1. CoIR Benchmark Reference Scores (Published)")
    report.append("\nSource: CoIR Paper (ACL 2025) - https://arxiv.org/abs/2407.02883")
    report.append("\n### NDCG@10 Scores by Model and Dataset\n")

    # Header
    datasets = ["APPS", "CosQA", "Text2SQL", "CodeSearchNet", "CCR", "Contest-DL", "StackOverflow", "FB-ST", "FB-MT", "Average"]
    header = "| Model | " + " | ".join(datasets) + " |"
    separator = "|" + "|".join(["---"] * (len(datasets) + 1)) + "|"
    report.append(header)
    report.append(separator)

    # Data rows: format numeric scores to 2 decimals, pass anything else
    # (missing values shown as '-') through unchanged.
    for model, scores in COIR_REFERENCE_SCORES.items():
        row = f"| {model} | " + " | ".join([f"{scores.get(d, '-'):.2f}" if isinstance(scores.get(d), (int, float)) else str(scores.get(d, '-')) for d in datasets]) + " |"
        report.append(row)

    # Section 2: Recent Models
    report.append("\n### Recent Top Performers (2025)\n")
    report.append("| Model | Average NDCG@10 | Notes |")
    report.append("|-------|-----------------|-------|")
    for model, info in RECENT_MODELS.items():
        avg = info.get("Average", "-")
        note = info.get("note", "")
        report.append(f"| {model} | {avg} | {note} |")

    # Section 3: Local Evaluation Results
    report.append("\n## 2. Local Evaluation Results\n")
    report.append("Evaluated on synthetic CoIR-like datasets\n")

    for model_name, results in local_results.items():
        report.append(f"\n### {model_name}\n")
        report.append("| Dataset | NDCG@10 | Precision@10 | Recall@10 | MRR |")
        report.append("|---------|---------|--------------|-----------|-----|")
        for dataset_name, metrics in results.items():
            # The "Average" pseudo-dataset only has ndcg@10; render it
            # separately below the per-dataset rows.
            if dataset_name == "Average":
                continue
            ndcg = metrics.get("ndcg@10", 0)
            prec = metrics.get("precision@10", 0)
            rec = metrics.get("recall@10", 0)
            m = metrics.get("mrr", 0)
            report.append(f"| {dataset_name} | {ndcg:.2f} | {prec:.2f} | {rec:.2f} | {m:.2f} |")

        if "Average" in results:
            avg = results["Average"]["ndcg@10"]
            report.append(f"| **Average** | **{avg:.2f}** | - | - | - |")

    # Section 4: Comparison Analysis — only emitted when both local
    # models were successfully evaluated.
    report.append("\n## 3. Comparison Analysis\n")

    if "SPLADE" in local_results and "Dense (MiniLM)" in local_results:
        splade_avg = local_results["SPLADE"]["Average"]["ndcg@10"]
        dense_avg = local_results["Dense (MiniLM)"]["Average"]["ndcg@10"]

        report.append("### SPLADE vs Dense Embedding\n")
        report.append(f"- SPLADE Average NDCG@10: {splade_avg:.2f}")
        report.append(f"- Dense (MiniLM) Average NDCG@10: {dense_avg:.2f}")

        if splade_avg > dense_avg:
            diff = ((splade_avg - dense_avg) / dense_avg) * 100
            report.append(f"- SPLADE outperforms by {diff:.1f}%")
        else:
            diff = ((dense_avg - splade_avg) / splade_avg) * 100
            report.append(f"- Dense outperforms by {diff:.1f}%")

    # Section 5: Key Insights (static commentary, emitted verbatim)
    report.append("\n## 4. Key Insights\n")
    report.append("""
1. **Voyage-Code-002** achieved highest mean score (56.26) on original CoIR benchmark
2. **SFR-Embedding-Code-7B** (Salesforce) reached #1 in Feb 2025 with 67.4 average
3. **SPLADE** provides good balance of:
   - Interpretability (visible token activations)
   - Query expansion (learned synonyms)
   - Efficient sparse retrieval

4. **Task-specific performance varies significantly**:
   - E5-Mistral excels at Contest-DL (82.55) but median on APPS
   - Voyage-Code-002 excels at CodeSearchNet (81.79)
   - No single model dominates all tasks

5. **Hybrid approaches recommended**:
   - Combine sparse (SPLADE/BM25) with dense for best results
   - Use RRF (Reciprocal Rank Fusion) for score combination
""")

    # Section 6: Recommendations (static commentary, emitted verbatim)
    report.append("\n## 5. Recommendations for Codex-lens\n")
    report.append("""
| Use Case | Recommended Approach |
|----------|---------------------|
| General code search | SPLADE + Dense hybrid |
| Exact keyword match | FTS (BM25) |
| Semantic understanding | Dense embedding |
| Interpretable results | SPLADE only |
| Maximum accuracy | SFR-Embedding-Code + SPLADE fusion |
""")

    report_text = "\n".join(report)

    if output_path:
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(report_text)
        print(f"Report saved to: {output_path}")

    return report_text
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# MAIN
|
||||
# =============================================================================
|
||||
|
||||
def main():
|
||||
print("=" * 80)
|
||||
print("CODE RETRIEVAL BENCHMARK EVALUATION")
|
||||
print("=" * 80)
|
||||
|
||||
# Create test datasets
|
||||
print("\nCreating test datasets...")
|
||||
datasets = create_test_datasets()
|
||||
print(f" Created {len(datasets)} datasets")
|
||||
|
||||
local_results = {}
|
||||
|
||||
# Evaluate SPLADE
|
||||
print("\nEvaluating SPLADE...")
|
||||
try:
|
||||
from codexlens.semantic.splade_encoder import check_splade_available
|
||||
ok, err = check_splade_available()
|
||||
if ok:
|
||||
start = time.perf_counter()
|
||||
splade_encode = get_splade_encoder()
|
||||
splade_results = evaluate_model("SPLADE", splade_encode, datasets)
|
||||
elapsed = time.perf_counter() - start
|
||||
local_results["SPLADE"] = splade_results
|
||||
print(f" SPLADE evaluated in {elapsed:.2f}s")
|
||||
print(f" Average NDCG@10: {splade_results['Average']['ndcg@10']:.2f}")
|
||||
else:
|
||||
print(f" SPLADE not available: {err}")
|
||||
except Exception as e:
|
||||
print(f" SPLADE evaluation failed: {e}")
|
||||
|
||||
# Evaluate Dense (MiniLM)
|
||||
print("\nEvaluating Dense (all-MiniLM-L6-v2)...")
|
||||
try:
|
||||
start = time.perf_counter()
|
||||
dense_encode = get_dense_encoder("all-MiniLM-L6-v2")
|
||||
dense_results = evaluate_model("Dense (MiniLM)", dense_encode, datasets)
|
||||
elapsed = time.perf_counter() - start
|
||||
local_results["Dense (MiniLM)"] = dense_results
|
||||
print(f" Dense evaluated in {elapsed:.2f}s")
|
||||
print(f" Average NDCG@10: {dense_results['Average']['ndcg@10']:.2f}")
|
||||
except Exception as e:
|
||||
print(f" Dense evaluation failed: {e}")
|
||||
|
||||
# Generate report
|
||||
print("\nGenerating report...")
|
||||
report = generate_report(local_results, "benchmark_report.md")
|
||||
|
||||
print("\n" + "=" * 80)
|
||||
print("BENCHMARK COMPLETE")
|
||||
print("=" * 80)
|
||||
print("\nReport preview:\n")
|
||||
print(report[:3000] + "\n...[truncated]...")
|
||||
|
||||
return local_results
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,318 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
"""Debug script to trace semantic search (dense_rerank) flow step by step."""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import sqlite3
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Tuple
|
||||
|
||||
# Add src to path
|
||||
sys.path.insert(0, str(Path(__file__).parent / "src"))
|
||||
|
||||
# Configure detailed logging
|
||||
logging.basicConfig(
|
||||
level=logging.DEBUG,
|
||||
format="%(asctime)s | %(levelname)-5s | %(name)s | %(message)s",
|
||||
datefmt="%H:%M:%S",
|
||||
)
|
||||
|
||||
# Enable debug for specific modules
|
||||
for name in ["codexlens.search", "codexlens.semantic", "codexlens.indexing"]:
|
||||
logging.getLogger(name).setLevel(logging.DEBUG)
|
||||
|
||||
logger = logging.getLogger("debug_semantic")
|
||||
|
||||
|
||||
def load_config() -> Dict[str, Any]:
|
||||
"""Load config from codexlens settings."""
|
||||
config_path = Path.home() / ".codexlens" / "config.json"
|
||||
if config_path.exists():
|
||||
with open(config_path) as f:
|
||||
return json.load(f)
|
||||
return {}
|
||||
|
||||
|
||||
def inspect_hnsw_index(index_root: Path) -> Dict[str, Any]:
|
||||
"""Inspect centralized HNSW index metadata."""
|
||||
hnsw_path = index_root / "_vectors.hnsw"
|
||||
meta_path = index_root / "_vectors_meta.db"
|
||||
|
||||
result = {
|
||||
"hnsw_exists": hnsw_path.exists(),
|
||||
"meta_exists": meta_path.exists(),
|
||||
"hnsw_size_mb": round(hnsw_path.stat().st_size / (1024*1024), 2) if hnsw_path.exists() else 0,
|
||||
}
|
||||
|
||||
if meta_path.exists():
|
||||
conn = sqlite3.connect(str(meta_path))
|
||||
cursor = conn.execute("SELECT COUNT(*) FROM chunk_metadata")
|
||||
result["total_chunks"] = cursor.fetchone()[0]
|
||||
|
||||
# Sample file paths
|
||||
cursor = conn.execute("""
|
||||
SELECT DISTINCT file_path FROM chunk_metadata
|
||||
ORDER BY file_path LIMIT 20
|
||||
""")
|
||||
result["sample_files"] = [row[0] for row in cursor.fetchall()]
|
||||
|
||||
# Check if tests vs src
|
||||
cursor = conn.execute("""
|
||||
SELECT
|
||||
CASE
|
||||
WHEN file_path LIKE '%tests%' OR file_path LIKE '%test_%' THEN 'test'
|
||||
ELSE 'src'
|
||||
END as category,
|
||||
COUNT(*) as count
|
||||
FROM chunk_metadata
|
||||
GROUP BY category
|
||||
""")
|
||||
result["category_distribution"] = {row[0]: row[1] for row in cursor.fetchall()}
|
||||
|
||||
conn.close()
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def run_dense_search(query: str, index_root: Path, top_k: int = 50) -> List[Tuple[int, float, str]]:
|
||||
"""Execute dense vector search and return candidates with details."""
|
||||
from codexlens.semantic.ann_index import ANNIndex
|
||||
from codexlens.semantic.factory import get_embedder
|
||||
from codexlens.semantic.vector_store import VectorStore
|
||||
|
||||
logger.info("=" * 60)
|
||||
logger.info("STAGE 1: Dense Embedding Generation")
|
||||
logger.info("=" * 60)
|
||||
|
||||
# Read model config from index
|
||||
index_db = index_root / "_index.db"
|
||||
embedding_model = "qwen3-embedding-sf"
|
||||
embedding_backend = "litellm"
|
||||
|
||||
if index_db.exists():
|
||||
try:
|
||||
with VectorStore(index_db) as vs:
|
||||
model_config = vs.get_model_config()
|
||||
if model_config:
|
||||
embedding_backend = model_config.get("backend", embedding_backend)
|
||||
embedding_model = model_config.get("model_name", embedding_model)
|
||||
logger.info(f"Model config from index: {embedding_backend}/{embedding_model}")
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to read model config: {e}")
|
||||
|
||||
# Generate query embedding
|
||||
embedder = get_embedder(backend=embedding_backend, model=embedding_model)
|
||||
query_embedding = embedder.embed_to_numpy([query])[0]
|
||||
logger.info(f"Query: {query!r}")
|
||||
logger.info(f"Query embedding dim: {query_embedding.shape[0]}")
|
||||
logger.info(f"Query embedding norm: {(query_embedding**2).sum()**0.5:.4f}")
|
||||
|
||||
# Load HNSW index
|
||||
logger.info("=" * 60)
|
||||
logger.info("STAGE 2: HNSW Vector Search (Coarse)")
|
||||
logger.info("=" * 60)
|
||||
|
||||
ann_index = ANNIndex.create_central(
|
||||
index_root=index_root,
|
||||
dim=query_embedding.shape[0],
|
||||
)
|
||||
if not ann_index.load():
|
||||
logger.error("Failed to load HNSW index")
|
||||
return []
|
||||
|
||||
logger.info(f"HNSW index count: {ann_index.count()}")
|
||||
|
||||
# Execute search
|
||||
ids, distances = ann_index.search(query_embedding, top_k=top_k)
|
||||
logger.info(f"Found {len(ids)} candidates")
|
||||
|
||||
# Get chunk details
|
||||
candidates = []
|
||||
meta_path = index_root / "_vectors_meta.db"
|
||||
if meta_path.exists():
|
||||
conn = sqlite3.connect(str(meta_path))
|
||||
conn.row_factory = sqlite3.Row
|
||||
|
||||
for chunk_id, distance in zip(ids, distances):
|
||||
cursor = conn.execute("""
|
||||
SELECT file_path, content, start_line, end_line
|
||||
FROM chunk_metadata WHERE chunk_id = ?
|
||||
""", (int(chunk_id),))
|
||||
row = cursor.fetchone()
|
||||
if row:
|
||||
candidates.append((
|
||||
int(chunk_id),
|
||||
float(distance),
|
||||
row["file_path"],
|
||||
row["content"][:200] if row["content"] else "",
|
||||
row["start_line"],
|
||||
row["end_line"],
|
||||
))
|
||||
conn.close()
|
||||
|
||||
# Print top candidates
|
||||
logger.info("\nTop 20 Dense Search Candidates:")
|
||||
logger.info("-" * 80)
|
||||
for i, (cid, dist, path, content, start, end) in enumerate(candidates[:20]):
|
||||
score = max(0, 1 - dist)
|
||||
is_test = "tests/" in path or "test_" in Path(path).name
|
||||
marker = "[TEST]" if is_test else "[SRC]"
|
||||
logger.info(f"{i+1:2d}. {marker} dist={dist:.4f} score={score:.4f}")
|
||||
logger.info(f" {path}:{start}-{end}")
|
||||
logger.info(f" {content[:100]}...")
|
||||
logger.info("")
|
||||
|
||||
return candidates
|
||||
|
||||
|
||||
def run_reranking(query: str, candidates: List[Tuple], top_k: int = 10) -> List[Tuple[str, float, float]]:
|
||||
"""Execute cross-encoder reranking on candidates."""
|
||||
from codexlens.semantic.reranker import get_reranker, check_reranker_available
|
||||
|
||||
logger.info("=" * 60)
|
||||
logger.info("STAGE 3: Cross-Encoder Reranking")
|
||||
logger.info("=" * 60)
|
||||
|
||||
# Check reranker availability
|
||||
config = load_config()
|
||||
backend = config.get("reranker_backend", "api")
|
||||
model = config.get("reranker_model", "Qwen/Qwen3-Reranker-8B")
|
||||
|
||||
logger.info(f"Reranker backend: {backend}")
|
||||
logger.info(f"Reranker model: {model}")
|
||||
|
||||
ok, err = check_reranker_available(backend)
|
||||
if not ok:
|
||||
logger.error(f"Reranker not available: {err}")
|
||||
return []
|
||||
|
||||
reranker = get_reranker(backend=backend, model_name=model)
|
||||
|
||||
# Prepare pairs for reranking
|
||||
pairs = []
|
||||
for cid, dist, path, content, start, end in candidates[:50]: # Top 50 for reranking
|
||||
doc_text = content if content else path
|
||||
pairs.append((query, doc_text))
|
||||
|
||||
logger.info(f"Reranking {len(pairs)} candidates...")
|
||||
|
||||
# Execute reranking
|
||||
scores = reranker.score_pairs(pairs, batch_size=32)
|
||||
|
||||
# Combine scores
|
||||
results = []
|
||||
for i, (cid, dist, path, content, start, end) in enumerate(candidates[:len(scores)]):
|
||||
dense_score = max(0, 1 - dist)
|
||||
rerank_score = scores[i]
|
||||
combined = 0.5 * dense_score + 0.5 * rerank_score
|
||||
is_test = "tests/" in path or "test_" in Path(path).name
|
||||
results.append((path, dense_score, rerank_score, combined, is_test, content[:100]))
|
||||
|
||||
# Sort by combined score
|
||||
results.sort(key=lambda x: x[3], reverse=True)
|
||||
|
||||
logger.info("\nTop 20 Reranked Results:")
|
||||
logger.info("-" * 100)
|
||||
logger.info(f"{'Rank':>4} {'Type':^6} {'Dense':^8} {'Rerank':^8} {'Combined':^8} Path")
|
||||
logger.info("-" * 100)
|
||||
for i, (path, dense, rerank, combined, is_test, content) in enumerate(results[:20]):
|
||||
marker = "TEST" if is_test else "SRC"
|
||||
logger.info(f"{i+1:4d} [{marker:^4}] {dense:8.4f} {rerank:8.4f} {combined:8.4f} {path}")
|
||||
|
||||
return results[:top_k]
|
||||
|
||||
|
||||
def analyze_problem(candidates: List[Tuple], results: List[Tuple]):
|
||||
"""Analyze why tests might rank higher than src files."""
|
||||
logger.info("=" * 60)
|
||||
logger.info("ANALYSIS: Why Tests Rank Higher?")
|
||||
logger.info("=" * 60)
|
||||
|
||||
# Count test vs src in dense candidates
|
||||
test_in_dense = sum(1 for c in candidates[:50] if "tests/" in c[2] or "test_" in Path(c[2]).name)
|
||||
src_in_dense = 50 - test_in_dense
|
||||
|
||||
logger.info(f"\nDense Search (top 50):")
|
||||
logger.info(f" - Test files: {test_in_dense} ({test_in_dense*2}%)")
|
||||
logger.info(f" - Src files: {src_in_dense} ({src_in_dense*2}%)")
|
||||
|
||||
# Average scores by category
|
||||
test_dense_scores = [max(0, 1-c[1]) for c in candidates[:50] if "tests/" in c[2] or "test_" in Path(c[2]).name]
|
||||
src_dense_scores = [max(0, 1-c[1]) for c in candidates[:50] if not ("tests/" in c[2] or "test_" in Path(c[2]).name)]
|
||||
|
||||
if test_dense_scores:
|
||||
logger.info(f"\nDense Score Averages:")
|
||||
logger.info(f" - Test files: {sum(test_dense_scores)/len(test_dense_scores):.4f}")
|
||||
if src_dense_scores:
|
||||
logger.info(f" - Src files: {sum(src_dense_scores)/len(src_dense_scores):.4f}")
|
||||
|
||||
# Check rerank score distribution
|
||||
test_results = [r for r in results if r[4]]
|
||||
src_results = [r for r in results if not r[4]]
|
||||
|
||||
if test_results and src_results:
|
||||
logger.info(f"\nRerank Score Averages:")
|
||||
logger.info(f" - Test files: {sum(r[2] for r in test_results)/len(test_results):.4f}")
|
||||
logger.info(f" - Src files: {sum(r[2] for r in src_results)/len(src_results):.4f}")
|
||||
|
||||
logger.info("\n" + "=" * 60)
|
||||
logger.info("HYPOTHESIS:")
|
||||
logger.info("=" * 60)
|
||||
|
||||
if test_in_dense > src_in_dense:
|
||||
logger.info("→ Problem is at DENSE SEARCH stage")
|
||||
logger.info(" Test files have embeddings closer to query")
|
||||
logger.info(" Possible causes:")
|
||||
logger.info(" 1. Test files mention implementation concepts in comments/docstrings")
|
||||
logger.info(" 2. Embedding model doesn't distinguish between tests and implementation")
|
||||
logger.info(" 3. Test file chunks are more frequent in the index")
|
||||
else:
|
||||
logger.info("→ Problem may be at RERANKING stage")
|
||||
logger.info(" Reranker gives higher scores to test content")
|
||||
|
||||
|
||||
def main():
|
||||
query = "文件索引和嵌入向量生成的实现逻辑"
|
||||
index_root = Path(r"C:\Users\dyw\.codexlens\indexes\D\Claude_dms3")
|
||||
|
||||
logger.info("=" * 60)
|
||||
logger.info("DEBUG: Semantic Search Analysis")
|
||||
logger.info("=" * 60)
|
||||
logger.info(f"Query: {query}")
|
||||
logger.info(f"Index root: {index_root}")
|
||||
logger.info("")
|
||||
|
||||
# Step 1: Inspect index
|
||||
logger.info("STEP 0: Index Inspection")
|
||||
logger.info("-" * 60)
|
||||
index_info = inspect_hnsw_index(index_root)
|
||||
for k, v in index_info.items():
|
||||
if k == "sample_files":
|
||||
logger.info(f" {k}:")
|
||||
for f in v[:10]:
|
||||
logger.info(f" - {f}")
|
||||
elif k == "category_distribution":
|
||||
logger.info(f" {k}:")
|
||||
for cat, count in v.items():
|
||||
logger.info(f" - {cat}: {count}")
|
||||
else:
|
||||
logger.info(f" {k}: {v}")
|
||||
logger.info("")
|
||||
|
||||
# Step 2: Dense search
|
||||
candidates = run_dense_search(query, index_root, top_k=100)
|
||||
|
||||
if not candidates:
|
||||
logger.error("No candidates from dense search")
|
||||
return
|
||||
|
||||
# Step 3: Reranking
|
||||
results = run_reranking(query, candidates, top_k=20)
|
||||
|
||||
# Step 4: Analyze
|
||||
analyze_problem(candidates, results)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,276 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
"""Debug script v2: Trace the full semantic search flow with detailed logging."""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import sqlite3
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Tuple
|
||||
|
||||
# Add src to path
|
||||
sys.path.insert(0, str(Path(__file__).parent / "src"))
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s | %(levelname)-5s | %(message)s",
|
||||
datefmt="%H:%M:%S",
|
||||
)
|
||||
logger = logging.getLogger("debug")
|
||||
|
||||
|
||||
def count_chunks_by_category(index_root: Path) -> Dict[str, int]:
|
||||
"""Count chunks by category (src vs test) across all indexes."""
|
||||
counts = defaultdict(int)
|
||||
|
||||
for db_path in index_root.rglob("_index.db"):
|
||||
try:
|
||||
conn = sqlite3.connect(str(db_path))
|
||||
cursor = conn.execute("""
|
||||
SELECT file_path FROM semantic_chunks
|
||||
""")
|
||||
for row in cursor:
|
||||
path = row[0]
|
||||
if "tests" in path or "test_" in Path(path).name:
|
||||
counts["test"] += 1
|
||||
else:
|
||||
counts["src"] += 1
|
||||
conn.close()
|
||||
except:
|
||||
pass
|
||||
|
||||
return dict(counts)
|
||||
|
||||
|
||||
def run_dense_search_with_trace(query: str, source_path: Path) -> List[Dict]:
|
||||
"""Run dense search with detailed tracing."""
|
||||
from codexlens.config import Config
|
||||
from codexlens.search.chain_search import ChainSearchEngine, SearchOptions
|
||||
from codexlens.storage.registry import Registry
|
||||
from codexlens.storage.path_mapper import PathMapper
|
||||
|
||||
# Load config
|
||||
config = Config.load()
|
||||
registry = Registry(config.data_dir)
|
||||
mapper = PathMapper(config.data_dir)
|
||||
|
||||
# Create search engine with verbose logging
|
||||
engine = ChainSearchEngine(registry, mapper, config=config)
|
||||
engine.logger.setLevel(logging.DEBUG)
|
||||
|
||||
# Set up handler to capture all log output
|
||||
handler = logging.StreamHandler()
|
||||
handler.setLevel(logging.DEBUG)
|
||||
engine.logger.addHandler(handler)
|
||||
|
||||
# Execute cascade search with dense_rerank strategy
|
||||
options = SearchOptions(depth=-1) # Search all subdirectories
|
||||
|
||||
logger.info("=" * 70)
|
||||
logger.info("Executing dense_rerank cascade search...")
|
||||
logger.info(f"Query: {query}")
|
||||
logger.info(f"Source: {source_path}")
|
||||
logger.info("=" * 70)
|
||||
|
||||
result = engine.cascade_search(
|
||||
query=query,
|
||||
source_path=source_path,
|
||||
k=20,
|
||||
coarse_k=100,
|
||||
options=options,
|
||||
strategy="dense_rerank"
|
||||
)
|
||||
|
||||
# Analyze results
|
||||
logger.info("\n" + "=" * 70)
|
||||
logger.info("SEARCH RESULTS ANALYSIS")
|
||||
logger.info("=" * 70)
|
||||
|
||||
test_count = 0
|
||||
src_count = 0
|
||||
results_detail = []
|
||||
|
||||
for i, r in enumerate(result.results):
|
||||
is_test = "tests" in r.path or "test_" in Path(r.path).name
|
||||
if is_test:
|
||||
test_count += 1
|
||||
category = "TEST"
|
||||
else:
|
||||
src_count += 1
|
||||
category = "SRC"
|
||||
|
||||
# Get metadata scores if available
|
||||
pre_ce_score = r.metadata.get("pre_cross_encoder_score", r.score)
|
||||
ce_score = r.metadata.get("cross_encoder_score", 0)
|
||||
ce_prob = r.metadata.get("cross_encoder_prob", 0)
|
||||
|
||||
results_detail.append({
|
||||
"rank": i + 1,
|
||||
"category": category,
|
||||
"path": r.path,
|
||||
"score": r.score,
|
||||
"pre_ce_score": pre_ce_score,
|
||||
"ce_score": ce_score,
|
||||
"ce_prob": ce_prob,
|
||||
"excerpt": r.excerpt[:100] if r.excerpt else "",
|
||||
})
|
||||
|
||||
logger.info(f"{i+1:2d}. [{category:4s}] score={r.score:.4f} pre_ce={pre_ce_score:.4f} ce={ce_score:.4f}")
|
||||
logger.info(f" {r.path}")
|
||||
if r.excerpt:
|
||||
logger.info(f" {r.excerpt[:80]}...")
|
||||
logger.info("")
|
||||
|
||||
logger.info(f"\nSummary: {src_count} SRC files, {test_count} TEST files in top {len(result.results)}")
|
||||
logger.info(f"Search time: {result.stats.time_ms:.2f}ms")
|
||||
|
||||
return results_detail
|
||||
|
||||
|
||||
def compare_coarse_candidates():
|
||||
"""Compare coarse candidates before and after reranking."""
|
||||
from codexlens.config import Config
|
||||
from codexlens.semantic.factory import get_embedder
|
||||
from codexlens.semantic.ann_index import ANNIndex
|
||||
|
||||
query = "文件索引和嵌入向量生成的实现逻辑"
|
||||
config = Config.load()
|
||||
|
||||
# Generate query embedding
|
||||
embedder = get_embedder(backend="litellm", model="qwen3-embedding-sf")
|
||||
query_embedding = embedder.embed_to_numpy([query])[0]
|
||||
|
||||
logger.info("=" * 70)
|
||||
logger.info("COARSE CANDIDATE ANALYSIS (per directory)")
|
||||
logger.info("=" * 70)
|
||||
|
||||
# Scan all HNSW indexes
|
||||
index_root = Path(r"C:\Users\dyw\.codexlens\indexes\D\Claude_dms3\codex-lens")
|
||||
|
||||
all_candidates = []
|
||||
|
||||
for hnsw_path in index_root.rglob("_index_vectors.hnsw"):
|
||||
db_path = hnsw_path.parent / "_index.db"
|
||||
if not db_path.exists():
|
||||
continue
|
||||
|
||||
try:
|
||||
ann_index = ANNIndex(db_path, dim=query_embedding.shape[0])
|
||||
if not ann_index.load() or ann_index.count() == 0:
|
||||
continue
|
||||
|
||||
ids, distances = ann_index.search(query_embedding, top_k=10)
|
||||
|
||||
# Get file paths from chunks
|
||||
conn = sqlite3.connect(str(db_path))
|
||||
conn.row_factory = sqlite3.Row
|
||||
|
||||
dir_name = hnsw_path.parent.relative_to(index_root)
|
||||
|
||||
for chunk_id, dist in zip(ids, distances):
|
||||
cursor = conn.execute("""
|
||||
SELECT file_path, content FROM semantic_chunks WHERE id = ?
|
||||
""", (int(chunk_id),))
|
||||
row = cursor.fetchone()
|
||||
if row:
|
||||
is_test = "tests" in row["file_path"] or "test_" in Path(row["file_path"]).name
|
||||
all_candidates.append({
|
||||
"dir": str(dir_name),
|
||||
"chunk_id": int(chunk_id),
|
||||
"distance": float(dist),
|
||||
"score": max(0, 1 - float(dist)),
|
||||
"is_test": is_test,
|
||||
"file_path": row["file_path"],
|
||||
"content_preview": row["content"][:100] if row["content"] else ""
|
||||
})
|
||||
conn.close()
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Error processing {hnsw_path}: {e}")
|
||||
|
||||
# Sort by distance (closest first)
|
||||
all_candidates.sort(key=lambda x: x["distance"])
|
||||
|
||||
logger.info(f"\nTotal coarse candidates across all directories: {len(all_candidates)}")
|
||||
|
||||
# Analyze distribution
|
||||
test_candidates = [c for c in all_candidates if c["is_test"]]
|
||||
src_candidates = [c for c in all_candidates if not c["is_test"]]
|
||||
|
||||
logger.info(f"Test files: {len(test_candidates)}")
|
||||
logger.info(f"Src files: {len(src_candidates)}")
|
||||
|
||||
if test_candidates:
|
||||
avg_test_dist = sum(c["distance"] for c in test_candidates) / len(test_candidates)
|
||||
logger.info(f"Avg test distance: {avg_test_dist:.4f}")
|
||||
if src_candidates:
|
||||
avg_src_dist = sum(c["distance"] for c in src_candidates) / len(src_candidates)
|
||||
logger.info(f"Avg src distance: {avg_src_dist:.4f}")
|
||||
|
||||
logger.info("\nTop 30 candidates (combined from all directories):")
|
||||
logger.info("-" * 90)
|
||||
for i, c in enumerate(all_candidates[:30]):
|
||||
cat = "TEST" if c["is_test"] else "SRC"
|
||||
logger.info(f"{i+1:2d}. [{cat:4s}] dist={c['distance']:.4f} score={c['score']:.4f} dir={c['dir']}")
|
||||
logger.info(f" {Path(c['file_path']).name}")
|
||||
|
||||
return all_candidates
|
||||
|
||||
|
||||
def main():
|
||||
logger.info("=" * 70)
|
||||
logger.info("SEMANTIC SEARCH DEBUG SESSION")
|
||||
logger.info("=" * 70)
|
||||
|
||||
# Step 1: Count chunks distribution
|
||||
index_root = Path(r"C:\Users\dyw\.codexlens\indexes\D\Claude_dms3\codex-lens")
|
||||
counts = count_chunks_by_category(index_root)
|
||||
logger.info(f"\nChunk distribution in index:")
|
||||
logger.info(f" - Test chunks: {counts.get('test', 0)}")
|
||||
logger.info(f" - Src chunks: {counts.get('src', 0)}")
|
||||
|
||||
# Step 2: Compare coarse candidates
|
||||
logger.info("\n")
|
||||
candidates = compare_coarse_candidates()
|
||||
|
||||
# Step 3: Run full search
|
||||
logger.info("\n")
|
||||
query = "文件索引和嵌入向量生成的实现逻辑"
|
||||
source_path = Path(r"D:\Claude_dms3\codex-lens")
|
||||
results = run_dense_search_with_trace(query, source_path)
|
||||
|
||||
# Summary
|
||||
logger.info("\n" + "=" * 70)
|
||||
logger.info("ROOT CAUSE ANALYSIS")
|
||||
logger.info("=" * 70)
|
||||
|
||||
test_in_top10 = sum(1 for r in results[:10] if r["category"] == "TEST")
|
||||
src_in_top10 = 10 - test_in_top10
|
||||
|
||||
logger.info(f"\nTop 10 results: {src_in_top10} SRC, {test_in_top10} TEST")
|
||||
|
||||
if test_in_top10 > src_in_top10:
|
||||
logger.info("\nPROBLEM: Test files dominate top results")
|
||||
logger.info("\nPossible causes:")
|
||||
logger.info(" 1. Test files mention implementation concepts explicitly")
|
||||
logger.info(" (e.g., docstrings describe what they test)")
|
||||
logger.info(" 2. Embedding model treats test descriptions as similar to")
|
||||
logger.info(" implementation descriptions")
|
||||
logger.info(" 3. Cross-encoder reranker gives higher scores to")
|
||||
logger.info(" descriptive test content over implementation code")
|
||||
|
||||
# Check if coarse candidates already favor tests
|
||||
test_in_coarse_top30 = sum(1 for c in candidates[:30] if c["is_test"])
|
||||
if test_in_coarse_top30 > 15:
|
||||
logger.info(f"\n → Dense coarse search already favors tests")
|
||||
logger.info(f" ({test_in_coarse_top30}/30 test files in coarse top-30)")
|
||||
logger.info(f" Problem is at EMBEDDING/DENSE SEARCH stage")
|
||||
else:
|
||||
logger.info(f"\n → Coarse search is balanced ({test_in_coarse_top30}/30 tests)")
|
||||
logger.info(f" Problem is at CROSS-ENCODER RERANKING stage")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
BIN
codex-lens/dist/codex_lens-0.1.0-py3-none-any.whl
vendored
BIN
codex-lens/dist/codex_lens-0.1.0-py3-none-any.whl
vendored
Binary file not shown.
BIN
codex-lens/dist/codex_lens-0.1.0.tar.gz
vendored
BIN
codex-lens/dist/codex_lens-0.1.0.tar.gz
vendored
Binary file not shown.
@@ -1,171 +0,0 @@
|
||||
# Chain Search Quick Reference
|
||||
|
||||
## Import
|
||||
|
||||
```python
|
||||
from pathlib import Path
|
||||
from codexlens.search import (
|
||||
ChainSearchEngine,
|
||||
SearchOptions,
|
||||
quick_search
|
||||
)
|
||||
from codexlens.storage.registry import RegistryStore
|
||||
from codexlens.storage.path_mapper import PathMapper
|
||||
```
|
||||
|
||||
## One-Line Search
|
||||
|
||||
```python
|
||||
results = quick_search("query", Path("/path/to/search"), depth=-1)
|
||||
```
|
||||
|
||||
## Full Engine Usage
|
||||
|
||||
### 1. Initialize Engine
|
||||
```python
|
||||
registry = RegistryStore()
|
||||
registry.initialize()
|
||||
mapper = PathMapper()
|
||||
engine = ChainSearchEngine(registry, mapper)
|
||||
```
|
||||
|
||||
### 2. Configure Search
|
||||
```python
|
||||
options = SearchOptions(
|
||||
depth=-1, # -1 = unlimited, 0 = current dir only
|
||||
max_workers=8, # Parallel threads
|
||||
limit_per_dir=10, # Max results per directory
|
||||
total_limit=100, # Total result limit
|
||||
include_symbols=False, # Include symbol search
|
||||
files_only=False # Return only paths
|
||||
)
|
||||
```
|
||||
|
||||
### 3. Execute Search
|
||||
```python
|
||||
result = engine.search("query", Path("/path"), options)
|
||||
|
||||
# Access results
|
||||
for r in result.results:
|
||||
print(f"{r.path}: score={r.score:.2f}")
|
||||
print(f" {r.excerpt}")
|
||||
|
||||
# Check statistics
|
||||
print(f"Searched {result.stats.dirs_searched} directories")
|
||||
print(f"Found {result.stats.files_matched} files")
|
||||
print(f"Time: {result.stats.time_ms:.2f}ms")
|
||||
```
|
||||
|
||||
### 4. Symbol Search
|
||||
```python
|
||||
symbols = engine.search_symbols(
|
||||
"function_name",
|
||||
Path("/path"),
|
||||
kind="function" # Optional: 'function', 'class', 'method', etc.
|
||||
)
|
||||
|
||||
for sym in symbols:
|
||||
print(f"{sym.name} ({sym.kind}) at lines {sym.range[0]}-{sym.range[1]}")
|
||||
```
|
||||
|
||||
### 5. Files-Only Mode
|
||||
```python
|
||||
paths = engine.search_files_only("query", Path("/path"))
|
||||
for path in paths:
|
||||
print(path)
|
||||
```
|
||||
|
||||
## SearchOptions Parameters
|
||||
|
||||
| Parameter | Type | Default | Description |
|
||||
|-----------|------|---------|-------------|
|
||||
| `depth` | int | -1 | Search depth (-1 = unlimited) |
|
||||
| `max_workers` | int | 8 | Parallel worker threads |
|
||||
| `limit_per_dir` | int | 10 | Max results per directory |
|
||||
| `total_limit` | int | 100 | Total result limit |
|
||||
| `include_symbols` | bool | False | Include symbol search |
|
||||
| `files_only` | bool | False | Return only file paths |
|
||||
|
||||
## SearchResult Fields
|
||||
|
||||
| Field | Type | Description |
|
||||
|-------|------|-------------|
|
||||
| `path` | str | File path |
|
||||
| `score` | float | BM25 relevance score |
|
||||
| `excerpt` | str | Highlighted text snippet |
|
||||
| `content` | str | Full matched content (optional) |
|
||||
| `symbol` | Symbol | Matched symbol (optional) |
|
||||
|
||||
## SearchStats Fields
|
||||
|
||||
| Field | Type | Description |
|
||||
|-------|------|-------------|
|
||||
| `dirs_searched` | int | Number of directories searched |
|
||||
| `files_matched` | int | Number of files with matches |
|
||||
| `time_ms` | float | Total search time (milliseconds) |
|
||||
| `errors` | List[str] | Error messages |
|
||||
|
||||
## Common Patterns
|
||||
|
||||
### Search Current Project
|
||||
```python
|
||||
result = engine.search("authentication", Path.cwd())
|
||||
```
|
||||
|
||||
### Limit Depth for Speed
|
||||
```python
|
||||
options = SearchOptions(depth=2) # Only 2 levels deep
|
||||
result = engine.search("TODO", Path("/project"), options)
|
||||
```
|
||||
|
||||
### Find All Implementations
|
||||
```python
|
||||
symbols = engine.search_symbols("__init__", Path("/project"), kind="function")
|
||||
```
|
||||
|
||||
### Quick File List
|
||||
```python
|
||||
files = engine.search_files_only("config", Path("/project"))
|
||||
```
|
||||
|
||||
### Comprehensive Search
|
||||
```python
|
||||
options = SearchOptions(
|
||||
depth=-1,
|
||||
total_limit=500,
|
||||
include_symbols=True
|
||||
)
|
||||
result = engine.search("api", Path("/project"), options)
|
||||
print(f"Files: {len(result.results)}")
|
||||
print(f"Symbols: {len(result.symbols)}")
|
||||
```
|
||||
|
||||
## Performance Tips
|
||||
|
||||
1. **Use depth limits** for faster searches in large codebases
|
||||
2. **Use files_only** when you don't need excerpts
|
||||
3. **Reuse ChainSearchEngine** instance for multiple searches
|
||||
4. **Adjust max_workers** based on CPU cores
|
||||
5. **Use limit_per_dir** to reduce memory usage
|
||||
|
||||
## Error Handling
|
||||
|
||||
```python
|
||||
result = engine.search("query", Path("/path"))
|
||||
|
||||
if result.stats.errors:
|
||||
print("Errors occurred:")
|
||||
for error in result.stats.errors:
|
||||
print(f" - {error}")
|
||||
|
||||
if not result.results:
|
||||
print("No results found")
|
||||
else:
|
||||
print(f"Found {len(result.results)} results")
|
||||
```
|
||||
|
||||
## Cleanup
|
||||
|
||||
```python
|
||||
registry.close() # Close when done
|
||||
```
|
||||
@@ -1,676 +0,0 @@
|
||||
# Codexlens LSP API 规范
|
||||
|
||||
**版本**: 1.1
|
||||
**状态**: ✅ APPROVED (Gemini Review)
|
||||
**架构**: codexlens 提供 Python API,CCW 实现 MCP 端点
|
||||
**分析来源**: Gemini (架构评审) + Codex (实现评审)
|
||||
**最后更新**: 2025-01-17
|
||||
|
||||
---
|
||||
|
||||
## 一、概述
|
||||
|
||||
### 1.1 背景
|
||||
|
||||
基于 cclsp MCP 服务器实现的分析,设计 codexlens 的 LSP 搜索方法接口,为 AI 提供代码智能能力。
|
||||
|
||||
### 1.2 架构决策
|
||||
|
||||
**MCP 端点由 CCW 实现,codexlens 只提供 Python API**
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ Claude Code │
|
||||
│ ┌───────────────────────────────────────────────────────┐ │
|
||||
│ │ MCP Client │ │
|
||||
│ └───────────────────────────────────────────────────────┘ │
|
||||
│ │ │
|
||||
│ ▼ │
|
||||
│ ┌───────────────────────────────────────────────────────┐ │
|
||||
│ │ CCW MCP Server │ │
|
||||
│ │ ┌─────────────────────────────────────────────────┐ │ │
|
||||
│ │ │ MCP Tool Handlers │ │ │
|
||||
│ │ │ • codexlens_file_context │ │ │
|
||||
│ │ │ • codexlens_find_definition │ │ │
|
||||
│ │ │ • codexlens_find_references │ │ │
|
||||
│ │ │ • codexlens_semantic_search │ │ │
|
||||
│ │ └──────────────────────┬──────────────────────────┘ │ │
|
||||
│ └─────────────────────────┼─────────────────────────────┘ │
|
||||
└────────────────────────────┼────────────────────────────────┘
|
||||
│ Python API 调用
|
||||
▼
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ codexlens │
|
||||
│ ┌───────────────────────────────────────────────────────┐ │
|
||||
│ │ Public API Layer │ │
|
||||
│ │ codexlens.api.file_context() │ │
|
||||
│ │ codexlens.api.find_definition() │ │
|
||||
│ │ codexlens.api.find_references() │ │
|
||||
│ │ codexlens.api.semantic_search() │ │
|
||||
│ └──────────────────────┬────────────────────────────────┘ │
|
||||
│ │ │
|
||||
│ ┌──────────────────────▼────────────────────────────────┐ │
|
||||
│ │ Core Components │ │
|
||||
│ │ GlobalSymbolIndex | ChainSearchEngine | HoverProvider │ │
|
||||
│ └───────────────────────────────────────────────────────┘ │
|
||||
│ │ │
|
||||
│ ┌──────────────────────▼────────────────────────────────┐ │
|
||||
│ │ SQLite Index Databases │ │
|
||||
│ │ global_symbols.db | *.index.db (per-directory) │ │
|
||||
│ └───────────────────────────────────────────────────────┘ │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
### 1.3 职责分离
|
||||
|
||||
| 组件 | 职责 |
|
||||
|------|------|
|
||||
| **codexlens** | Python API、索引查询、搜索算法、结果聚合、降级处理 |
|
||||
| **CCW** | MCP 协议、参数校验、结果序列化、错误处理、project_root 推断 |
|
||||
|
||||
### 1.4 codexlens vs cclsp 对比
|
||||
|
||||
| 特性 | cclsp | codexlens |
|
||||
|------|-------|-----------|
|
||||
| 数据源 | 实时 LSP 服务器 | 预建 SQLite 索引 |
|
||||
| 启动时间 | 200-3000ms | <50ms |
|
||||
| 响应时间 | 50-500ms | <5ms |
|
||||
| 跨语言 | 每语言需要 LSP 服务器 | 统一 Python/TS/JS/Go 索引 |
|
||||
| 依赖 | 需要语言服务器 | 无外部依赖 |
|
||||
| 准确度 | 100% (编译器级) | 95%+ (tree-sitter) |
|
||||
| 重命名支持 | 是 | 否 (只读索引) |
|
||||
| 实时诊断 | 是 | 通过 IDE MCP |
|
||||
|
||||
**推荐**: codexlens 用于快速搜索,cclsp 用于精确重构
|
||||
|
||||
---
|
||||
|
||||
## 二、cclsp 设计模式 (参考)
|
||||
|
||||
### 2.1 MCP 工具接口设计
|
||||
|
||||
| 模式 | 说明 | 代码位置 |
|
||||
|------|------|----------|
|
||||
| **基于名称** | 接受 `symbol_name` 而非文件坐标 | `index.ts:70` |
|
||||
| **安全消歧义** | `rename_symbol` → `rename_symbol_strict` 两步 | `index.ts:133, 172` |
|
||||
| **复杂性抽象** | 隐藏 LSP 协议细节 | `index.ts:211` |
|
||||
| **优雅失败** | 返回有用的文本响应 | 全局 |
|
||||
|
||||
### 2.2 符号解析算法
|
||||
|
||||
```
|
||||
1. getDocumentSymbols (lsp-client.ts:1406)
|
||||
└─ 获取文件所有符号
|
||||
|
||||
2. 处理两种格式:
|
||||
├─ DocumentSymbol[] → 扁平化
|
||||
└─ SymbolInformation[] → 二次定位
|
||||
|
||||
3. 过滤: symbol.name === symbolName && symbol.kind
|
||||
|
||||
4. 回退: 无结果时移除 kind 约束重试
|
||||
|
||||
5. 聚合: 遍历所有匹配,聚合定义位置
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 三、需求规格
|
||||
|
||||
### 需求 1: 文件上下文查询 (`file_context`)
|
||||
|
||||
**用途**: 读取代码文件,返回文件中所有方法的调用关系摘要
|
||||
|
||||
**输出示例**:
|
||||
```markdown
|
||||
## src/auth/login.py (3 methods)
|
||||
|
||||
### login_user (line 15-45)
|
||||
- Calls: validate_password (auth/utils.py:23), create_session (session/manager.py:89)
|
||||
- Called by: handle_login_request (api/routes.py:156), test_login (tests/test_auth.py:34)
|
||||
|
||||
### validate_token (line 47-62)
|
||||
- Calls: decode_jwt (auth/jwt.py:12)
|
||||
- Called by: auth_middleware (middleware/auth.py:28)
|
||||
```
|
||||
|
||||
### 需求 2: 通用 LSP 搜索 (cclsp 兼容)
|
||||
|
||||
| 端点 | 用途 |
|
||||
|------|------|
|
||||
| `find_definition` | 根据符号名查找定义位置 |
|
||||
| `find_references` | 查找符号的所有引用 |
|
||||
| `workspace_symbols` | 工作区符号搜索 |
|
||||
| `get_hover` | 获取符号悬停信息 |
|
||||
|
||||
### 需求 3: 向量 + LSP 融合搜索
|
||||
|
||||
**用途**: 结合向量语义搜索和结构化 LSP 搜索
|
||||
|
||||
**融合策略**:
|
||||
- **RRF** (首选): 简单、不需要分数归一化、鲁棒
|
||||
- **Cascade**: 特定场景,先向量后 LSP
|
||||
- **Adaptive**: 长期目标,按查询类型自动选择
|
||||
|
||||
---
|
||||
|
||||
## 四、API 规范
|
||||
|
||||
### 4.1 模块结构
|
||||
|
||||
```
|
||||
src/codexlens/
|
||||
├─ api/ [新增] 公开 API 层
|
||||
│ ├─ __init__.py 导出所有 API
|
||||
│ ├─ file_context.py 文件上下文
|
||||
│ ├─ definition.py 定义查找
|
||||
│ ├─ references.py 引用查找
|
||||
│ ├─ symbols.py 符号搜索
|
||||
│ ├─ hover.py 悬停信息
|
||||
│ └─ semantic.py 语义搜索
|
||||
│
|
||||
├─ storage/
|
||||
│ ├─ global_index.py [扩展] get_file_symbols()
|
||||
│ └─ relationship_query.py [新增] 有向调用查询
|
||||
│
|
||||
└─ search/
|
||||
└─ chain_search.py [修复] schema 兼容
|
||||
```
|
||||
|
||||
### 4.2 `codexlens.api.file_context()`
|
||||
|
||||
```python
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Optional, Dict, Tuple
|
||||
|
||||
@dataclass
|
||||
class CallInfo:
|
||||
"""调用关系信息"""
|
||||
symbol_name: str
|
||||
file_path: Optional[str] # 目标文件 (可能为 None)
|
||||
line: int
|
||||
relationship: str # call | import | inheritance
|
||||
|
||||
@dataclass
|
||||
class MethodContext:
|
||||
"""方法上下文"""
|
||||
name: str
|
||||
kind: str # function | method | class
|
||||
line_range: Tuple[int, int]
|
||||
signature: Optional[str]
|
||||
calls: List[CallInfo] # 出向调用
|
||||
callers: List[CallInfo] # 入向调用
|
||||
|
||||
@dataclass
|
||||
class FileContextResult:
|
||||
"""文件上下文结果"""
|
||||
file_path: str
|
||||
language: str
|
||||
methods: List[MethodContext]
|
||||
summary: str # 人类可读摘要
|
||||
discovery_status: Dict[str, bool] = field(default_factory=lambda: {
|
||||
"outgoing_resolved": False,
|
||||
"incoming_resolved": True,
|
||||
"targets_resolved": False
|
||||
})
|
||||
|
||||
def file_context(
|
||||
project_root: str,
|
||||
file_path: str,
|
||||
include_calls: bool = True,
|
||||
include_callers: bool = True,
|
||||
max_depth: int = 1,
|
||||
format: str = "brief" # brief | detailed | tree
|
||||
) -> FileContextResult:
|
||||
"""
|
||||
获取代码文件的方法调用上下文。
|
||||
|
||||
Args:
|
||||
project_root: 项目根目录 (用于定位索引)
|
||||
file_path: 代码文件路径
|
||||
include_calls: 是否包含出向调用
|
||||
include_callers: 是否包含入向调用
|
||||
max_depth: 调用链深度 (1=直接调用)
|
||||
⚠️ V1 限制: 当前版本仅支持 max_depth=1
|
||||
深度调用链分析将在 V2 实现
|
||||
format: 输出格式
|
||||
|
||||
Returns:
|
||||
FileContextResult
|
||||
|
||||
Raises:
|
||||
IndexNotFoundError: 项目未索引
|
||||
FileNotFoundError: 文件不存在
|
||||
|
||||
Note:
|
||||
V1 实现限制:
|
||||
- max_depth 仅支持 1 (直接调用)
|
||||
- 出向调用目标文件可能为 None (未解析)
|
||||
- 深度调用链分析作为 V2 特性规划
|
||||
"""
|
||||
```
|
||||
|
||||
### 4.3 `codexlens.api.find_definition()`
|
||||
|
||||
```python
|
||||
@dataclass
|
||||
class DefinitionResult:
|
||||
"""定义查找结果"""
|
||||
name: str
|
||||
kind: str
|
||||
file_path: str
|
||||
line: int
|
||||
end_line: int
|
||||
signature: Optional[str]
|
||||
container: Optional[str] # 所属类/模块
|
||||
score: float
|
||||
|
||||
def find_definition(
|
||||
project_root: str,
|
||||
symbol_name: str,
|
||||
symbol_kind: Optional[str] = None,
|
||||
file_context: Optional[str] = None,
|
||||
limit: int = 10
|
||||
) -> List[DefinitionResult]:
|
||||
"""
|
||||
根据符号名称查找定义位置。
|
||||
|
||||
Fallback 策略:
|
||||
1. 精确匹配 + kind 过滤
|
||||
2. 精确匹配 (移除 kind)
|
||||
3. 前缀匹配
|
||||
"""
|
||||
```
|
||||
|
||||
### 4.4 `codexlens.api.find_references()`
|
||||
|
||||
```python
|
||||
@dataclass
|
||||
class ReferenceResult:
|
||||
"""引用结果"""
|
||||
file_path: str
|
||||
line: int
|
||||
column: int
|
||||
context_line: str
|
||||
relationship: str # call | import | type_annotation | inheritance
|
||||
|
||||
@dataclass
|
||||
class GroupedReferences:
|
||||
"""按定义分组的引用"""
|
||||
definition: DefinitionResult
|
||||
references: List[ReferenceResult]
|
||||
|
||||
def find_references(
|
||||
project_root: str,
|
||||
symbol_name: str,
|
||||
symbol_kind: Optional[str] = None,
|
||||
include_definition: bool = True,
|
||||
group_by_definition: bool = True,
|
||||
limit: int = 100
|
||||
) -> List[GroupedReferences]:
|
||||
"""
|
||||
查找符号的所有引用位置。
|
||||
|
||||
多定义时分组返回,解决引用混淆问题。
|
||||
"""
|
||||
```
|
||||
|
||||
### 4.5 `codexlens.api.workspace_symbols()`
|
||||
|
||||
```python
|
||||
@dataclass
|
||||
class SymbolInfo:
|
||||
"""符号信息"""
|
||||
name: str
|
||||
kind: str
|
||||
file_path: str
|
||||
line: int
|
||||
container: Optional[str]
|
||||
score: float
|
||||
|
||||
def workspace_symbols(
|
||||
project_root: str,
|
||||
query: str,
|
||||
kind_filter: Optional[List[str]] = None,
|
||||
file_pattern: Optional[str] = None,
|
||||
limit: int = 50
|
||||
) -> List[SymbolInfo]:
|
||||
"""在整个工作区搜索符号 (前缀匹配)。"""
|
||||
```
|
||||
|
||||
### 4.6 `codexlens.api.get_hover()`
|
||||
|
||||
```python
|
||||
@dataclass
|
||||
class HoverInfo:
|
||||
"""悬停信息"""
|
||||
name: str
|
||||
kind: str
|
||||
signature: str
|
||||
documentation: Optional[str]
|
||||
file_path: str
|
||||
line_range: Tuple[int, int]
|
||||
type_info: Optional[str]
|
||||
|
||||
def get_hover(
|
||||
project_root: str,
|
||||
symbol_name: str,
|
||||
file_path: Optional[str] = None
|
||||
) -> Optional[HoverInfo]:
|
||||
"""获取符号的详细悬停信息。"""
|
||||
```
|
||||
|
||||
### 4.7 `codexlens.api.semantic_search()`
|
||||
|
||||
```python
|
||||
@dataclass
|
||||
class SemanticResult:
|
||||
"""语义搜索结果"""
|
||||
symbol_name: str
|
||||
kind: str
|
||||
file_path: str
|
||||
line: int
|
||||
vector_score: Optional[float]
|
||||
structural_score: Optional[float]
|
||||
fusion_score: float
|
||||
snippet: str
|
||||
match_reason: Optional[str]
|
||||
|
||||
def semantic_search(
|
||||
project_root: str,
|
||||
query: str,
|
||||
mode: str = "fusion", # vector | structural | fusion
|
||||
vector_weight: float = 0.5,
|
||||
structural_weight: float = 0.3,
|
||||
keyword_weight: float = 0.2,
|
||||
fusion_strategy: str = "rrf", # rrf | staged | binary | hybrid
|
||||
kind_filter: Optional[List[str]] = None,
|
||||
limit: int = 20,
|
||||
include_match_reason: bool = False
|
||||
) -> List[SemanticResult]:
|
||||
"""
|
||||
语义搜索 - 结合向量和结构化搜索。
|
||||
|
||||
Args:
|
||||
project_root: 项目根目录
|
||||
query: 自然语言查询
|
||||
mode: 搜索模式
|
||||
- vector: 仅向量搜索
|
||||
- structural: 仅结构搜索 (符号 + 关系)
|
||||
- fusion: 融合搜索 (默认)
|
||||
vector_weight: 向量搜索权重 [0, 1]
|
||||
structural_weight: 结构搜索权重 [0, 1]
|
||||
keyword_weight: 关键词搜索权重 [0, 1]
|
||||
fusion_strategy: 融合策略 (映射到 chain_search.py)
|
||||
- rrf: Reciprocal Rank Fusion (推荐,默认)
|
||||
- staged: 分阶段级联 → staged_cascade_search
|
||||
- binary: 二分重排级联 → binary_rerank_cascade_search
|
||||
- hybrid: 混合级联 → hybrid_search
|
||||
kind_filter: 符号类型过滤
|
||||
limit: 最大返回数量
|
||||
include_match_reason: 是否生成匹配原因 (启发式,非 LLM)
|
||||
|
||||
Returns:
|
||||
按 fusion_score 排序的结果列表
|
||||
|
||||
降级行为:
|
||||
- 无向量索引: vector_score=None, 使用 FTS + 结构搜索
|
||||
- 无关系数据: structural_score=None, 仅向量搜索
|
||||
"""
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 五、已知问题与解决方案
|
||||
|
||||
### 5.1 P0 阻塞项
|
||||
|
||||
| 问题 | 位置 | 解决方案 |
|
||||
|------|------|----------|
|
||||
| **索引 Schema 不匹配** | `chain_search.py:313-324` vs `dir_index.py:304-312` | 兼容 `full_path` 和 `path` |
|
||||
| **文件符号查询缺失** | `global_index.py:214-260` | 新增 `get_file_symbols()` |
|
||||
| **出向调用查询缺失** | `dir_index.py:333-342` | 新增 `RelationshipQuery` |
|
||||
| **关系类型不一致** | `entities.py:74-79` | 规范化 `calls` → `call` |
|
||||
|
||||
### 5.2 设计缺陷 (Gemini 发现)
|
||||
|
||||
| 缺陷 | 影响 | 解决方案 |
|
||||
|------|------|----------|
|
||||
| **调用图不完整** | `file_context` 缺少出向调用 | 新增有向调用 API |
|
||||
| **消歧义未定义** | 多定义时无法区分 | 实现 `rank_by_proximity()` |
|
||||
| **AI 特性成本过高** | `explanation` 需要 LLM | 设为可选,默认关闭 |
|
||||
| **融合参数不一致** | 3 分支但只有 2 权重 | 补充 `keyword_weight` |
|
||||
|
||||
### 5.3 消歧义算法
|
||||
|
||||
**V1 实现** (基于文件路径接近度):
|
||||
|
||||
```python
|
||||
def rank_by_proximity(
|
||||
results: List[DefinitionResult],
|
||||
file_context: str
|
||||
) -> List[DefinitionResult]:
|
||||
"""按文件接近度排序 (V1: 路径接近度)"""
|
||||
def proximity_score(result):
|
||||
# 1. 同目录最高分
|
||||
if os.path.dirname(result.file_path) == os.path.dirname(file_context):
|
||||
return 100
|
||||
# 2. 共同路径前缀长度
|
||||
common = os.path.commonpath([result.file_path, file_context])
|
||||
return len(common)
|
||||
|
||||
return sorted(results, key=proximity_score, reverse=True)
|
||||
```
|
||||
|
||||
**V2 增强计划** (基于 import graph 距离):
|
||||
|
||||
```python
|
||||
def rank_by_import_distance(
|
||||
results: List[DefinitionResult],
|
||||
file_context: str,
|
||||
import_graph: Dict[str, Set[str]]
|
||||
) -> List[DefinitionResult]:
|
||||
"""按 import graph 距离排序 (V2)"""
|
||||
def import_distance(result):
|
||||
# BFS 计算最短 import 路径
|
||||
return bfs_shortest_path(
|
||||
import_graph,
|
||||
file_context,
|
||||
result.file_path
|
||||
)
|
||||
|
||||
# 组合: 0.6 * import_distance + 0.4 * path_proximity
|
||||
return sorted(results, key=lambda r: (
|
||||
0.6 * import_distance(r) +
|
||||
0.4 * (100 - proximity_score(r))
|
||||
))
|
||||
```
|
||||
|
||||
### 5.4 参考实现: `get_file_symbols()`
|
||||
|
||||
**位置**: `src/codexlens/storage/global_index.py`
|
||||
|
||||
```python
|
||||
def get_file_symbols(self, file_path: str | Path) -> List[Symbol]:
|
||||
"""
|
||||
获取指定文件中定义的所有符号。
|
||||
|
||||
Args:
|
||||
file_path: 文件路径 (相对或绝对)
|
||||
|
||||
Returns:
|
||||
按行号排序的符号列表
|
||||
"""
|
||||
file_path_str = str(Path(file_path).resolve())
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
rows = conn.execute(
|
||||
"""
|
||||
SELECT symbol_name, symbol_kind, file_path, start_line, end_line
|
||||
FROM global_symbols
|
||||
WHERE project_id = ? AND file_path = ?
|
||||
ORDER BY start_line
|
||||
""",
|
||||
(self.project_id, file_path_str),
|
||||
).fetchall()
|
||||
|
||||
return [
|
||||
Symbol(
|
||||
name=row["symbol_name"],
|
||||
kind=row["symbol_kind"],
|
||||
range=(row["start_line"], row["end_line"]),
|
||||
file=row["file_path"],
|
||||
)
|
||||
for row in rows
|
||||
]
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 六、实现计划
|
||||
|
||||
### Phase 0: 基础设施 (16h)
|
||||
|
||||
| 任务 | 工时 | 说明 |
|
||||
|------|------|------|
|
||||
| 修复 `search_references` schema | 4h | 兼容两种 schema |
|
||||
| 新增 `GlobalSymbolIndex.get_file_symbols()` | 4h | 文件符号查询 (见 5.4) |
|
||||
| 新增 `RelationshipQuery` 类 | 6h | 有向调用查询 |
|
||||
| 关系类型规范化层 | 2h | `calls` → `call` |
|
||||
|
||||
### Phase 1: API 层 (48h)
|
||||
|
||||
| 任务 | 工时 | 复杂度 |
|
||||
|------|------|--------|
|
||||
| `find_definition()` | 4h | S |
|
||||
| `find_references()` | 8h | M |
|
||||
| `workspace_symbols()` | 4h | S |
|
||||
| `get_hover()` | 4h | S |
|
||||
| `file_context()` | 16h | L |
|
||||
| `semantic_search()` | 12h | M |
|
||||
|
||||
### Phase 2: 测试与文档 (16h)
|
||||
|
||||
| 任务 | 工时 |
|
||||
|------|------|
|
||||
| 单元测试 (≥80%) | 8h |
|
||||
| API 文档 | 4h |
|
||||
| 示例代码 | 4h |
|
||||
|
||||
### 关键路径
|
||||
|
||||
```
|
||||
Phase 0.1 (schema fix)
|
||||
↓
|
||||
Phase 0.2 (file symbols) → Phase 1.5 (file_context)
|
||||
↓
|
||||
Phase 1 (其他 API)
|
||||
↓
|
||||
Phase 2 (测试)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 七、测试策略
|
||||
|
||||
### 7.1 单元测试
|
||||
|
||||
```python
|
||||
# test_global_index.py
|
||||
def test_get_file_symbols():
|
||||
index = GlobalSymbolIndex(":memory:")
|
||||
index.update_file_symbols(project_id=1, file_path="test.py", symbols=[...])
|
||||
results = index.get_file_symbols("test.py")
|
||||
assert len(results) == 3
|
||||
|
||||
# test_relationship_query.py
|
||||
def test_outgoing_calls():
|
||||
store = DirIndexStore(":memory:")
|
||||
calls = store.get_outgoing_calls("src/auth.py", "login")
|
||||
assert calls[0].relationship == "call" # 已规范化
|
||||
```
|
||||
|
||||
### 7.2 Schema 兼容性测试
|
||||
|
||||
```python
|
||||
def test_search_references_both_schemas():
|
||||
"""测试两种 schema 的引用搜索"""
|
||||
# 旧 schema: files(path, ...)
|
||||
# 新 schema: files(full_path, ...)
|
||||
```
|
||||
|
||||
### 7.3 降级测试
|
||||
|
||||
```python
|
||||
def test_semantic_search_without_vectors():
|
||||
result = semantic_search(project_root=".", query="auth", mode="fusion")[0]
|
||||
assert result.vector_score is None
|
||||
assert result.fusion_score > 0
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 八、使用示例
|
||||
|
||||
```python
|
||||
from codexlens.api import (
|
||||
file_context,
|
||||
find_definition,
|
||||
find_references,
|
||||
semantic_search
|
||||
)
|
||||
|
||||
# 1. 获取文件上下文
|
||||
result = file_context(
|
||||
project_root="/path/to/project",
|
||||
file_path="src/auth/login.py",
|
||||
format="brief"
|
||||
)
|
||||
print(result.summary)
|
||||
|
||||
# 2. 查找定义
|
||||
definitions = find_definition(
|
||||
project_root="/path/to/project",
|
||||
symbol_name="UserService",
|
||||
symbol_kind="class"
|
||||
)
|
||||
|
||||
# 3. 语义搜索
|
||||
results = semantic_search(
|
||||
project_root="/path/to/project",
|
||||
query="处理用户登录验证的函数",
|
||||
mode="fusion"
|
||||
)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 九、CCW 集成
|
||||
|
||||
| codexlens API | CCW MCP Tool |
|
||||
|---------------|--------------|
|
||||
| `file_context()` | `codexlens_file_context` |
|
||||
| `find_definition()` | `codexlens_find_definition` |
|
||||
| `find_references()` | `codexlens_find_references` |
|
||||
| `workspace_symbols()` | `codexlens_workspace_symbol` |
|
||||
| `get_hover()` | `codexlens_get_hover` |
|
||||
| `semantic_search()` | `codexlens_semantic_search` |
|
||||
|
||||
---
|
||||
|
||||
## 十、分析来源
|
||||
|
||||
| 工具 | Session ID | 贡献 |
|
||||
|------|------------|------|
|
||||
| Gemini | `1768618654438-gemini` | 架构评审、设计缺陷、融合策略 |
|
||||
| Codex | `1768618658183-codex` | 组件复用、复杂度估算、任务分解 |
|
||||
| Gemini | `1768620615744-gemini` | 最终评审、改进建议、APPROVED |
|
||||
|
||||
---
|
||||
|
||||
## 十一、版本历史
|
||||
|
||||
| 版本 | 日期 | 变更 |
|
||||
|------|------|------|
|
||||
| 1.0 | 2025-01-17 | 初始版本,合并多文档 |
|
||||
| 1.1 | 2025-01-17 | 应用 Gemini 评审改进: V1 限制说明、策略映射、消歧义增强、参考实现 |
|
||||
@@ -1,326 +0,0 @@
|
||||
# CodexLens Auto Hybrid Mode - Implementation Summary
|
||||
|
||||
## 概述
|
||||
|
||||
实现了两个主要功能:
|
||||
1. **自动向量嵌入生成**:`init` 命令在检测到语义搜索依赖后自动生成向量嵌入
|
||||
2. **默认混合搜索模式**:`search` 命令在检测到嵌入存在时自动使用 hybrid 模式
|
||||
|
||||
## 修改文件
|
||||
|
||||
### 1. codex-lens CLI (`codex-lens/src/codexlens/cli/commands.py`)
|
||||
|
||||
#### 1.1 `init` 命令增强
|
||||
|
||||
**新增参数**:
|
||||
- `--no-embeddings`: 跳过自动嵌入生成
|
||||
- `--embedding-model`: 指定嵌入模型 (默认: "code")
|
||||
|
||||
**自动嵌入生成逻辑**:
|
||||
```python
|
||||
# 在 init 成功后
|
||||
if not no_embeddings:
|
||||
from codexlens.semantic import SEMANTIC_AVAILABLE
|
||||
if SEMANTIC_AVAILABLE:
|
||||
# 自动调用 generate_embeddings()
|
||||
# 使用指定的 embedding_model
|
||||
```
|
||||
|
||||
**行为**:
|
||||
- 检测 `fastembed` 和 `numpy` 是否安装
|
||||
- 如果可用,自动生成嵌入(可用 `--no-embeddings` 跳过)
|
||||
- 默认使用 "code" 模型 (jinaai/jina-embeddings-v2-base-code)
|
||||
- 在输出中显示嵌入生成进度和统计
|
||||
|
||||
#### 1.2 `search` 命令增强
|
||||
|
||||
**模式变更**:
|
||||
- 默认模式从 `"exact"` 改为 `"auto"`
|
||||
- 新增 `"auto"` 模式到有效模式列表
|
||||
|
||||
**自动模式检测逻辑**:
|
||||
```python
|
||||
if mode == "auto":
|
||||
# 检查项目是否有嵌入
|
||||
project_record = registry.find_by_source_path(str(search_path))
|
||||
if project_record:
|
||||
embed_status = check_embeddings_status(index_path)
|
||||
if has_embeddings:
|
||||
actual_mode = "hybrid" # 使用混合模式
|
||||
else:
|
||||
actual_mode = "exact" # 降级到精确模式
|
||||
```
|
||||
|
||||
**行为**:
|
||||
- 默认使用 `auto` 模式
|
||||
- 自动检测索引是否有嵌入
|
||||
- 有嵌入 → 使用 `hybrid` 模式(精确 + 模糊 + 向量融合)
|
||||
- 无嵌入 → 使用 `exact` 模式(仅全文搜索)
|
||||
- 用户仍可手动指定模式覆盖自动检测
|
||||
|
||||
### 2. MCP 工具简化 (`ccw/src/tools/codex-lens.ts`)
|
||||
|
||||
#### 2.1 简化 action 枚举
|
||||
|
||||
**仅暴露核心操作**:
|
||||
- `init`: 初始化索引(自动生成嵌入)
|
||||
- `search`: 搜索代码(自动混合模式)
|
||||
- `search_files`: 搜索文件路径
|
||||
|
||||
**移除的高级操作**(仍可通过 CLI 使用):
|
||||
- ~~`symbol`~~: 符号提取 → 使用 `codexlens symbol`
|
||||
- ~~`status`~~: 状态检查 → 使用 `codexlens status`
|
||||
- ~~`config_show/set/migrate`~~: 配置管理 → 使用 `codexlens config`
|
||||
- ~~`clean`~~: 清理索引 → 使用 `codexlens clean`
|
||||
- ~~`bootstrap/check`~~: 安装管理 → 自动处理
|
||||
|
||||
**简化的 ParamsSchema**:
|
||||
```typescript
|
||||
const ParamsSchema = z.object({
|
||||
action: z.enum(['init', 'search', 'search_files']),
|
||||
path: z.string().optional(),
|
||||
query: z.string().optional(),
|
||||
mode: z.enum(['auto', 'text', 'semantic', 'exact', 'fuzzy', 'hybrid', 'vector', 'pure-vector']).default('auto'),
|
||||
languages: z.array(z.string()).optional(),
|
||||
limit: z.number().default(20),
|
||||
});
|
||||
```
|
||||
|
||||
#### 2.2 扩展 mode 枚举并设置默认值
|
||||
|
||||
**模式支持**:
|
||||
```typescript
|
||||
mode: z.enum(['auto', 'text', 'semantic', 'exact', 'fuzzy', 'hybrid', 'vector', 'pure-vector']).default('auto')
|
||||
```
|
||||
|
||||
**模式映射**(MCP → CLI):
|
||||
```typescript
|
||||
const modeMap: Record<string, string> = {
|
||||
'text': 'exact',
|
||||
'semantic': 'pure-vector',
|
||||
'auto': 'auto', // 默认:自动检测
|
||||
'exact': 'exact',
|
||||
'fuzzy': 'fuzzy',
|
||||
'hybrid': 'hybrid',
|
||||
'vector': 'vector',
|
||||
'pure-vector': 'pure-vector',
|
||||
};
|
||||
```
|
||||
|
||||
#### 2.3 传递 mode 参数到 CLI
|
||||
|
||||
```typescript
|
||||
const args = ['search', query, '--limit', limit.toString(), '--mode', cliMode, '--json'];
|
||||
```
|
||||
|
||||
### 3. 文档更新 (`.claude/rules/context-requirements.md`)
|
||||
|
||||
#### 3.1 更新 init 说明
|
||||
|
||||
强调自动嵌入生成功能:
|
||||
```markdown
|
||||
**NEW**: `init` automatically generates vector embeddings if semantic dependencies are installed (fastembed).
|
||||
- Auto-detects if `numpy` and `fastembed` are available
|
||||
- Uses "code" model by default (jinaai/jina-embeddings-v2-base-code)
|
||||
- Skip with `--no-embeddings` flag if needed
|
||||
```
|
||||
|
||||
#### 3.2 更新 search 说明
|
||||
|
||||
强调自动混合模式:
|
||||
```markdown
|
||||
**Search Code** (Auto Hybrid Mode - DEFAULT):
|
||||
# Simple call - auto-detects mode (hybrid if embeddings exist, exact otherwise):
|
||||
codex_lens(action="search", query="authentication", path=".", limit=20)
|
||||
```
|
||||
|
||||
#### 3.3 详细模式说明
|
||||
|
||||
添加完整的模式列表和默认行为说明:
|
||||
- `auto`: **DEFAULT** - Uses hybrid if embeddings exist, exact otherwise
|
||||
- `hybrid`: Exact + Fuzzy + Vector fusion (best results, auto-selected if embeddings exist)
|
||||
- 其他模式...
|
||||
|
||||
## 使用示例
|
||||
|
||||
### 场景 1:首次使用(已安装 fastembed)
|
||||
|
||||
```bash
|
||||
# 初始化索引(自动生成嵌入)
|
||||
codexlens init .
|
||||
|
||||
# 输出:
|
||||
# OK Indexed 150 files in 12 directories
|
||||
#
|
||||
# Generating embeddings...
|
||||
# Model: code
|
||||
# ✓ Generated 1234 embeddings in 45.2s
|
||||
|
||||
# 搜索(自动使用 hybrid 模式)
|
||||
codexlens search "authentication"
|
||||
# Mode: hybrid | Searched 12 directories in 15.2ms
|
||||
```
|
||||
|
||||
### 场景 2:首次使用(未安装 fastembed)
|
||||
|
||||
```bash
|
||||
# 初始化索引(跳过嵌入)
|
||||
codexlens init .
|
||||
|
||||
# 输出:
|
||||
# OK Indexed 150 files in 12 directories
|
||||
# (无嵌入生成提示)
|
||||
|
||||
# 搜索(降级到 exact 模式)
|
||||
codexlens search "authentication"
|
||||
# Mode: exact | Searched 12 directories in 8.5ms
|
||||
```
|
||||
|
||||
### 场景 3:手动控制
|
||||
|
||||
```bash
|
||||
# 跳过嵌入生成
|
||||
codexlens init . --no-embeddings
|
||||
|
||||
# 强制使用特定模式
|
||||
codexlens search "auth" --mode exact
|
||||
codexlens search "how to authenticate" --mode hybrid
|
||||
```
|
||||
|
||||
### 场景 4:MCP 工具使用(简化版)
|
||||
|
||||
```python
|
||||
# 初始化(自动生成嵌入)
|
||||
codex_lens(action="init", path=".")
|
||||
|
||||
# 搜索(默认 auto 模式:有嵌入用 hybrid,无嵌入用 exact)
|
||||
codex_lens(action="search", query="authentication")
|
||||
|
||||
# 强制混合模式
|
||||
codex_lens(action="search", query="authentication", mode="hybrid")
|
||||
|
||||
# 强制精确模式
|
||||
codex_lens(action="search", query="authenticate_user", mode="exact")
|
||||
|
||||
# 仅返回文件路径
|
||||
codex_lens(action="search_files", query="payment processing")
|
||||
```
|
||||
|
||||
**高级操作使用 CLI**:
|
||||
```bash
|
||||
# 检查状态
|
||||
codexlens status
|
||||
|
||||
# 提取符号
|
||||
codexlens symbol src/auth/login.js
|
||||
|
||||
# 配置管理
|
||||
codexlens config show
|
||||
codexlens config set index_dir /custom/path
|
||||
|
||||
# 清理索引
|
||||
codexlens clean .
|
||||
```
|
||||
|
||||
## 技术细节
|
||||
|
||||
### 嵌入检测逻辑
|
||||
|
||||
1. 查找项目在 registry 中的记录
|
||||
2. 获取索引路径 `index_root/_index.db`
|
||||
3. 调用 `check_embeddings_status()` 检查:
|
||||
- 是否存在 `chunks` 表
|
||||
- `chunks_count > 0`
|
||||
4. 根据检测结果选择模式
|
||||
|
||||
### 混合搜索权重
|
||||
|
||||
默认 RRF 权重:
|
||||
- Exact FTS: 0.4
|
||||
- Fuzzy FTS: 0.3
|
||||
- Vector: 0.3
|
||||
|
||||
可通过 `--weights` 参数自定义:
|
||||
```bash
|
||||
codexlens search "query" --mode hybrid --weights 0.5,0.3,0.2
|
||||
```
|
||||
|
||||
### 模型选项
|
||||
|
||||
| 模型 | 模型名称 | 维度 | 大小 | 推荐场景 |
|
||||
|------|---------|------|------|---------|
|
||||
| fast | BAAI/bge-small-en-v1.5 | 384 | ~80MB | 快速原型 |
|
||||
| code | jinaai/jina-embeddings-v2-base-code | 768 | ~150MB | **推荐** 代码搜索 |
|
||||
| multilingual | intfloat/multilingual-e5-large | 1024 | ~1GB | 多语言项目 |
|
||||
| balanced | mixedbread-ai/mxbai-embed-large-v1 | 1024 | ~600MB | 平衡性能 |
|
||||
|
||||
## 兼容性
|
||||
|
||||
### 向后兼容
|
||||
|
||||
- 所有现有命令仍然工作
|
||||
- 手动指定 `--mode` 会覆盖自动检测
|
||||
- 使用 `--no-embeddings` 可恢复旧行为
|
||||
|
||||
### 依赖要求
|
||||
|
||||
**核心功能**(无需额外依赖):
|
||||
- FTS 索引(exact, fuzzy)
|
||||
- 符号提取
|
||||
|
||||
**语义搜索功能**(需要安装):
|
||||
```bash
|
||||
pip install codexlens[semantic]
|
||||
# 或
|
||||
pip install numpy fastembed
|
||||
```
|
||||
|
||||
## 性能影响
|
||||
|
||||
### 初始化时间
|
||||
|
||||
- FTS 索引:~2-5 秒(100 文件)
|
||||
- 嵌入生成:+30-60 秒(首次下载模型)
|
||||
- 后续嵌入:+10-20 秒
|
||||
|
||||
### 搜索性能
|
||||
|
||||
| 模式 | 延迟 | 召回率 | 推荐场景 |
|
||||
|------|------|--------|---------|
|
||||
| exact | 5ms | 中 | 精确代码标识符 |
|
||||
| fuzzy | 7ms | 中 | 容错搜索 |
|
||||
| hybrid | 15ms | **最高** | **通用搜索(推荐)** |
|
||||
| vector | 12ms | 高 | 语义查询 |
|
||||
| pure-vector | 10ms | 中 | 自然语言 |
|
||||
|
||||
## 最小化修改原则
|
||||
|
||||
所有修改都遵循最小化原则:
|
||||
1. **保持向后兼容**:不破坏现有功能
|
||||
2. **默认智能**:自动检测最佳模式
|
||||
3. **用户可控**:可通过参数覆盖自动行为
|
||||
4. **渐进增强**:未安装 fastembed 时优雅降级
|
||||
|
||||
## 总结
|
||||
|
||||
✅ **init 命令自动生成嵌入**(可用 `--no-embeddings` 跳过)
|
||||
✅ **search 命令默认使用混合模式**(有嵌入时自动启用)
|
||||
✅ **MCP 工具简化为核心操作**(init, search, search_files)
|
||||
✅ **所有搜索模式支持**(auto, exact, fuzzy, hybrid, vector, pure-vector)
|
||||
✅ **文档已更新**反映新的默认行为
|
||||
✅ **保持向后兼容性**
|
||||
✅ **优雅降级**(无 fastembed 时使用 exact 模式)
|
||||
|
||||
### MCP vs CLI 功能对比
|
||||
|
||||
| 功能 | MCP 工具 | CLI |
|
||||
|------|---------|-----|
|
||||
| 初始化索引 | ✅ `codex_lens(action="init")` | ✅ `codexlens init` |
|
||||
| 搜索代码 | ✅ `codex_lens(action="search")` | ✅ `codexlens search` |
|
||||
| 搜索文件 | ✅ `codex_lens(action="search_files")` | ✅ `codexlens search --files-only` |
|
||||
| 检查状态 | ❌ 使用 CLI | ✅ `codexlens status` |
|
||||
| 提取符号 | ❌ 使用 CLI | ✅ `codexlens symbol` |
|
||||
| 配置管理 | ❌ 使用 CLI | ✅ `codexlens config` |
|
||||
| 清理索引 | ❌ 使用 CLI | ✅ `codexlens clean` |
|
||||
|
||||
**设计理念**:MCP 工具专注于高频核心操作(索引、搜索),高级管理操作通过 CLI 执行。
|
||||
@@ -1,298 +0,0 @@
|
||||
# CodexLens 配置说明
|
||||
|
||||
## 目录结构
|
||||
|
||||
```
|
||||
~/.codexlens/ # 全局数据目录
|
||||
├── .env # 全局 API 配置 (新增)
|
||||
├── settings.json # 运行时设置
|
||||
├── embedding_lock.json # 模型锁定文件
|
||||
├── registry.db # 项目注册表
|
||||
├── indexes/ # 集中式索引存储
|
||||
└── venv/ # Python 虚拟环境
|
||||
|
||||
project/
|
||||
├── .codexlens/ # 工作区本地目录
|
||||
│ ├── .env # 工作区 API 配置 (覆盖全局)
|
||||
│ ├── index.db # 项目索引数据库
|
||||
│ ├── cache/ # 缓存目录
|
||||
│ └── .gitignore # 排除敏感文件
|
||||
└── .env # 项目根目录配置
|
||||
```
|
||||
|
||||
## 配置优先级
|
||||
|
||||
配置加载顺序 (后者覆盖前者):
|
||||
|
||||
| 优先级 | 位置 | 说明 |
|
||||
|--------|------|------|
|
||||
| 1 (最低) | `~/.codexlens/.env` | 全局默认配置 |
|
||||
| 2 | `project/.env` | 项目根目录配置 |
|
||||
| 3 | `project/.codexlens/.env` | 工作区本地配置 |
|
||||
| 4 (最高) | 环境变量 | Shell 环境变量 |
|
||||
|
||||
## 环境变量
|
||||
|
||||
### Embedding 配置
|
||||
|
||||
用于 `litellm` 后端的嵌入向量服务:
|
||||
|
||||
```bash
|
||||
# API 密钥
|
||||
EMBEDDING_API_KEY=your-api-key
|
||||
|
||||
# API 基础 URL
|
||||
EMBEDDING_API_BASE=https://api.example.com/v1
|
||||
|
||||
# 嵌入模型名称
|
||||
EMBEDDING_MODEL=text-embedding-3-small
|
||||
```
|
||||
|
||||
**支持的提供商示例**:
|
||||
|
||||
| 提供商 | API Base | 模型示例 |
|
||||
|--------|----------|----------|
|
||||
| OpenAI | `https://api.openai.com/v1` | `text-embedding-3-small` |
|
||||
| ModelScope | `https://api-inference.modelscope.cn/v1` | `Qwen/Qwen3-Embedding-8B` |
|
||||
| Azure | `https://your-resource.openai.azure.com` | `text-embedding-ada-002` |
|
||||
|
||||
### LiteLLM 配置
|
||||
|
||||
用于 LLM 功能 (重排序、语义分析等):
|
||||
|
||||
```bash
|
||||
# API 密钥
|
||||
LITELLM_API_KEY=your-api-key
|
||||
|
||||
# API 基础 URL
|
||||
LITELLM_API_BASE=https://api.example.com/v1
|
||||
|
||||
# 模型名称
|
||||
LITELLM_MODEL=gpt-4o-mini
|
||||
```
|
||||
|
||||
### Reranker 配置
|
||||
|
||||
用于搜索结果重排序 (可选):
|
||||
|
||||
```bash
|
||||
# API 密钥
|
||||
RERANKER_API_KEY=your-api-key
|
||||
|
||||
# API 基础 URL
|
||||
RERANKER_API_BASE=https://api.siliconflow.cn
|
||||
|
||||
# 提供商: siliconflow, cohere, jina
|
||||
RERANKER_PROVIDER=siliconflow
|
||||
|
||||
# 重排序模型
|
||||
RERANKER_MODEL=BAAI/bge-reranker-v2-m3
|
||||
```
|
||||
|
||||
### 通用配置
|
||||
|
||||
```bash
|
||||
# 自定义数据目录 (默认: ~/.codexlens)
|
||||
CODEXLENS_DATA_DIR=~/.codexlens
|
||||
|
||||
# 启用调试模式
|
||||
CODEXLENS_DEBUG=false
|
||||
```
|
||||
|
||||
## settings.json
|
||||
|
||||
运行时设置保存在 `~/.codexlens/settings.json`:
|
||||
|
||||
```json
|
||||
{
|
||||
"embedding": {
|
||||
"backend": "litellm",
|
||||
"model": "Qwen/Qwen3-Embedding-8B",
|
||||
"use_gpu": false,
|
||||
"endpoints": [
|
||||
{
|
||||
"model": "Qwen/Qwen3-Embedding-8B",
|
||||
"api_key": "${EMBEDDING_API_KEY}",
|
||||
"api_base": "${EMBEDDING_API_BASE}",
|
||||
"weight": 1.0
|
||||
}
|
||||
],
|
||||
"strategy": "latency_aware",
|
||||
"cooldown": 60.0
|
||||
},
|
||||
"llm": {
|
||||
"enabled": true,
|
||||
"tool": "gemini",
|
||||
"timeout_ms": 300000,
|
||||
"batch_size": 5
|
||||
},
|
||||
"parsing": {
|
||||
"use_astgrep": false
|
||||
},
|
||||
"indexing": {
|
||||
"static_graph_enabled": false,
|
||||
"static_graph_relationship_types": ["imports", "inherits"]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Embedding 设置
|
||||
|
||||
| 字段 | 类型 | 说明 |
|
||||
|------|------|------|
|
||||
| `backend` | string | `fastembed` (本地) 或 `litellm` (API) |
|
||||
| `model` | string | 模型名称或配置文件 |
|
||||
| `use_gpu` | bool | GPU 加速 (仅 fastembed) |
|
||||
| `endpoints` | array | 多端点配置 (仅 litellm) |
|
||||
| `strategy` | string | 负载均衡策略 |
|
||||
| `cooldown` | float | 限流冷却时间 (秒) |
|
||||
|
||||
**Embedding Backend 对比**:
|
||||
|
||||
| 特性 | fastembed | litellm |
|
||||
|------|-----------|---------|
|
||||
| 运行方式 | 本地 ONNX | API 调用 |
|
||||
| 依赖 | 本地模型文件 | API 密钥 |
|
||||
| 速度 | 快 (本地) | 取决于网络 |
|
||||
| 模型选择 | 预定义配置文件 | 任意 API 模型 |
|
||||
| GPU 支持 | 是 | N/A |
|
||||
|
||||
**负载均衡策略**:
|
||||
|
||||
| 策略 | 说明 |
|
||||
|------|------|
|
||||
| `round_robin` | 轮询分配 |
|
||||
| `latency_aware` | 延迟感知 (推荐) |
|
||||
| `weighted_random` | 加权随机 |
|
||||
|
||||
### LLM 设置
|
||||
|
||||
| 字段 | 类型 | 说明 |
|
||||
|------|------|------|
|
||||
| `enabled` | bool | 启用 LLM 功能 |
|
||||
| `tool` | string | LLM 工具 (`gemini`, `codex`) |
|
||||
| `timeout_ms` | int | 超时时间 (毫秒) |
|
||||
| `batch_size` | int | 批处理大小 |
|
||||
|
||||
### Parsing 设置
|
||||
|
||||
| 字段 | 类型 | 说明 |
|
||||
|------|------|------|
|
||||
| `use_astgrep` | bool | 优先使用 ast-grep 解析关系(实验性;当前主要用于 Python relationships) |
|
||||
|
||||
### Indexing 设置(静态图)
|
||||
|
||||
| 字段 | 类型 | 说明 |
|
||||
|------|------|------|
|
||||
| `static_graph_enabled` | bool | 索引时将 relationships 写入全局 `global_relationships`,用于搜索阶段静态图扩展 |
|
||||
| `static_graph_relationship_types` | array | 允许持久化的关系类型:`imports` / `inherits` / `calls` |
|
||||
|
||||
**CLI 覆盖(单次运行,不写入 settings.json)**:
|
||||
|
||||
```bash
|
||||
# 索引时启用静态图 relationships + 使用 ast-grep(如果可用)
|
||||
codexlens index init --use-astgrep --static-graph --static-graph-types imports,inherits,calls
|
||||
```
|
||||
|
||||
**Search staged 静态图扩展(高级)**:
|
||||
|
||||
```bash
|
||||
codexlens search --cascade-strategy staged --staged-stage2-mode static_global_graph
|
||||
```
|
||||
|
||||
## FastEmbed 模型配置文件
|
||||
|
||||
使用 `fastembed` 后端时的预定义模型:
|
||||
|
||||
| 配置文件 | 模型 | 维度 | 大小 |
|
||||
|----------|------|------|------|
|
||||
| `fast` | BAAI/bge-small-en-v1.5 | 384 | 80MB |
|
||||
| `base` | BAAI/bge-base-en-v1.5 | 768 | 220MB |
|
||||
| `code` | jinaai/jina-embeddings-v2-base-code | 768 | 150MB |
|
||||
| `minilm` | sentence-transformers/all-MiniLM-L6-v2 | 384 | 90MB |
|
||||
| `multilingual` | intfloat/multilingual-e5-large | 1024 | 1000MB |
|
||||
| `balanced` | mixedbread-ai/mxbai-embed-large-v1 | 1024 | 600MB |
|
||||
|
||||
## 快速开始
|
||||
|
||||
### 1. 使用全局配置
|
||||
|
||||
创建 `~/.codexlens/.env`:
|
||||
|
||||
```bash
|
||||
# 复制示例配置
|
||||
cp codex-lens/.env.example ~/.codexlens/.env
|
||||
|
||||
# 编辑配置
|
||||
nano ~/.codexlens/.env
|
||||
```
|
||||
|
||||
### 2. 使用本地嵌入 (fastembed)
|
||||
|
||||
```bash
|
||||
# 初始化索引 (使用 code 配置文件)
|
||||
codexlens init --backend fastembed --model code
|
||||
|
||||
# 或使用多语言模型
|
||||
codexlens init --backend fastembed --model multilingual
|
||||
```
|
||||
|
||||
### 3. 使用 API 嵌入 (litellm)
|
||||
|
||||
```bash
|
||||
# 设置环境变量
|
||||
export EMBEDDING_API_KEY=your-key
|
||||
export EMBEDDING_API_BASE=https://api.example.com/v1
|
||||
export EMBEDDING_MODEL=text-embedding-3-small
|
||||
|
||||
# 初始化索引
|
||||
codexlens init --backend litellm --model text-embedding-3-small
|
||||
```
|
||||
|
||||
### 4. 验证配置
|
||||
|
||||
```bash
|
||||
# 检查配置加载
|
||||
codexlens config show
|
||||
|
||||
# 测试嵌入
|
||||
codexlens test-embedding "Hello World"
|
||||
```
|
||||
|
||||
## 故障排除
|
||||
|
||||
### 配置未加载
|
||||
|
||||
检查文件权限和路径:
|
||||
|
||||
```bash
|
||||
ls -la ~/.codexlens/.env
|
||||
cat ~/.codexlens/.env
|
||||
```
|
||||
|
||||
### API 错误
|
||||
|
||||
1. 验证 API 密钥有效性
|
||||
2. 检查 API Base URL 是否正确
|
||||
3. 确认模型名称匹配提供商支持的模型
|
||||
|
||||
### 模型不兼容
|
||||
|
||||
如果更换嵌入模型,需要重建索引:
|
||||
|
||||
```bash
|
||||
# 删除旧索引
|
||||
rm -rf project/.codexlens/
|
||||
|
||||
# 重新初始化
|
||||
codexlens init --backend litellm --model new-model
|
||||
```
|
||||
|
||||
## 相关文件
|
||||
|
||||
| 文件 | 说明 |
|
||||
|------|------|
|
||||
| `src/codexlens/config.py` | 配置类定义 |
|
||||
| `src/codexlens/env_config.py` | 环境变量加载 |
|
||||
| `src/codexlens/cli/model_manager.py` | FastEmbed 模型管理 |
|
||||
| `src/codexlens/semantic/factory.py` | Embedder 工厂 |
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,540 +0,0 @@
|
||||
# Hybrid Search Architecture for CodexLens
|
||||
|
||||
> Embedding + Real-time LSP + Clustering + Reranking Pipeline
|
||||
|
||||
## Overview
|
||||
|
||||
This document describes the architecture for a hybrid intelligent code search system that combines:
|
||||
1. **Low-dimensional embedding model** for semantic search
|
||||
2. **Real-time LSP integration** for code structure analysis
|
||||
3. **Graph-based clustering** for result organization
|
||||
4. **Multi-factor reranking** for intelligent sorting
|
||||
|
||||
**Key Constraint**: Must use real-time LSP servers, NOT pre-indexed data.
|
||||
|
||||
## Architecture Diagram
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────┐
|
||||
│ HybridSearchEngine │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ 5-Stage Search Pipeline │ │
|
||||
│ │ │ │
|
||||
│ │ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌────┐│ │
|
||||
│ │ │ Stage 1 │──▶│ Stage 2 │──▶│ Stage 3 │──▶│ Stage 4 │──▶│ S5 ││ │
|
||||
│ │ │ Vector │ │ LSP │ │ Graph │ │Clustering│ │Rank││ │
|
||||
│ │ │ Search │ │Expansion │ │ Building │ │ +Filter │ │ ││ │
|
||||
│ │ └──────────┘ └──────────┘ └──────────┘ └──────────┘ └────┘│ │
|
||||
│ └─────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────────────────┐ │
|
||||
│ │VectorSearchSvc │ │ LspBridge │ │ GraphBuilder │ │
|
||||
│ │ │ │ │ │ │ │
|
||||
│ │ • Embedding │ │ • get_refs() │ │ • build_from_seeds() │ │
|
||||
│ │ • FAISS/HNSW │ │ • get_def() │ │ • add_relationships() │ │
|
||||
│ │ • search() │ │ • get_calls() │ │ • CodeAssociationGraph │ │
|
||||
│ └────────┬────────┘ └────────┬────────┘ └─────────────────────────────┘ │
|
||||
│ │ │ │
|
||||
└───────────┼────────────────────┼────────────────────────────────────────────┘
|
||||
│ │
|
||||
▼ ▼
|
||||
┌───────────────┐ ┌───────────────────────────────────────┐
|
||||
│ Embedding │ │ LanguageServerMultiplexer │
|
||||
│ Model (local) │ │ (from REAL_LSP_SERVER_PLAN.md) │
|
||||
│ │ │ │
|
||||
│ sentence- │ │ ┌─────┐ ┌─────┐ ┌─────┐ ┌──────────┐│
|
||||
│ transformers │ │ │pylsp│ │gopls│ │tssvr│ │rust-anlzr││
|
||||
│ │ │ └─────┘ └─────┘ └─────┘ └──────────┘│
|
||||
└───────────────┘ └───────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## Core Components
|
||||
|
||||
### 1. HybridSearchEngine (`hybrid_search/engine.py`)
|
||||
|
||||
**Role**: Main orchestrator coordinating all services
|
||||
|
||||
```python
|
||||
class HybridSearchEngine:
|
||||
def __init__(self):
|
||||
self.vector_service: VectorSearchService
|
||||
self.lsp_bridge: LspBridge
|
||||
self.graph_builder: GraphBuilder
|
||||
self.clustering_service: ClusteringService
|
||||
self.ranking_service: RankingService
|
||||
|
||||
async def search(self, query: str, top_k: int = 10) -> List[SearchResultCluster]:
|
||||
# Stage 1: Vector search for seeds
|
||||
seeds = await self.vector_service.search(query, top_k=top_k * 2)
|
||||
|
||||
# Stage 2-3: LSP expansion + Graph building
|
||||
graph = await self.graph_builder.build_from_seeds(seeds, self.lsp_bridge)
|
||||
|
||||
# Stage 4: Clustering + Filtering
|
||||
clusters = self.clustering_service.cluster(graph)
|
||||
clusters = self.clustering_service.filter_noise(clusters)
|
||||
|
||||
# Stage 5: Reranking
|
||||
ranked = self.ranking_service.rerank(clusters, seeds, query)
|
||||
|
||||
return ranked[:top_k]
|
||||
```
|
||||
|
||||
### 2. Data Structures (`hybrid_search/data_structures.py`)
|
||||
|
||||
```python
|
||||
@dataclass
|
||||
class CodeSymbolNode:
|
||||
"""Graph node representing a code symbol"""
|
||||
id: str # Unique: file_path:name:line
|
||||
name: str # Symbol name
|
||||
kind: str # function, class, method, variable
|
||||
file_path: str # Absolute file path
|
||||
range: Range # Start/end line and character
|
||||
embedding: Optional[List[float]] = None
|
||||
raw_code: str = ""
|
||||
docstring: str = ""
|
||||
|
||||
@dataclass
|
||||
class CodeAssociationGraph:
|
||||
"""Graph of code relationships"""
|
||||
nodes: Dict[str, CodeSymbolNode]
|
||||
edges: List[Tuple[str, str, str]] # (from_id, to_id, relationship_type)
|
||||
# relationship_type: 'calls', 'references', 'inherits', 'imports'
|
||||
|
||||
def to_networkx(self) -> nx.DiGraph:
|
||||
"""Convert to NetworkX for algorithms"""
|
||||
...
|
||||
|
||||
@dataclass
|
||||
class SearchResultCluster:
|
||||
"""Clustered search result"""
|
||||
cluster_id: str
|
||||
score: float
|
||||
title: str # AI-generated summary (optional)
|
||||
symbols: List[CodeSymbolNode]
|
||||
metadata: Dict[str, Any]
|
||||
```
|
||||
|
||||
### 3. VectorSearchService (`services/vector_search.py`)
|
||||
|
||||
**Role**: Semantic search using embeddings
|
||||
|
||||
```python
|
||||
class VectorSearchService:
|
||||
def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
|
||||
self.model = SentenceTransformer(model_name) # 384-dim, fast
|
||||
self.index: faiss.IndexFlatIP # or hnswlib for larger scale
|
||||
self.id_to_symbol: Dict[str, CodeSymbolNode]
|
||||
|
||||
async def index_codebase(self, symbols: List[CodeSymbolNode]):
|
||||
"""Build/update vector index from symbols"""
|
||||
texts = [f"{s.name} {s.docstring} {s.raw_code[:500]}" for s in symbols]
|
||||
embeddings = self.model.encode(texts, normalize_embeddings=True)
|
||||
self.index.add(embeddings)
|
||||
|
||||
async def search(self, query: str, top_k: int) -> List[CodeSymbolNode]:
|
||||
"""Find semantically similar symbols"""
|
||||
query_vec = self.model.encode([query], normalize_embeddings=True)
|
||||
scores, indices = self.index.search(query_vec, top_k)
|
||||
return [self.id_to_symbol[i] for i in indices[0]]
|
||||
```
|
||||
|
||||
**Embedding Model Selection**:
|
||||
| Model | Dimensions | Speed | Quality |
|
||||
|-------|-----------|-------|---------|
|
||||
| all-MiniLM-L6-v2 | 384 | Fast | Good |
|
||||
| all-mpnet-base-v2 | 768 | Medium | Better |
|
||||
| CodeBERT | 768 | Medium | Code-optimized |
|
||||
|
||||
### 4. LspBridge (`services/lsp_bridge.py`)
|
||||
|
||||
**Role**: Interface to real-time language servers via LanguageServerMultiplexer
|
||||
|
||||
```python
|
||||
class LspBridge:
|
||||
def __init__(self, multiplexer_url: str = "http://localhost:3458"):
|
||||
self.multiplexer_url = multiplexer_url
|
||||
self.cache: Dict[str, CacheEntry] = {} # file_path -> (mtime, data)
|
||||
self.session = aiohttp.ClientSession()
|
||||
|
||||
async def get_references(self, symbol: CodeSymbolNode) -> List[Location]:
|
||||
"""Get all references to a symbol (real-time LSP)"""
|
||||
cache_key = f"refs:{symbol.id}"
|
||||
if self._is_cached(cache_key, symbol.file_path):
|
||||
return self.cache[cache_key].data
|
||||
|
||||
response = await self._lsp_request("textDocument/references", {
|
||||
"textDocument": {"uri": f"file://{symbol.file_path}"},
|
||||
"position": {"line": symbol.range.start.line,
|
||||
"character": symbol.range.start.character},
|
||||
"context": {"includeDeclaration": True}
|
||||
})
|
||||
|
||||
locations = self._parse_locations(response)
|
||||
self._cache(cache_key, symbol.file_path, locations)
|
||||
return locations
|
||||
|
||||
async def get_call_hierarchy(self, symbol: CodeSymbolNode) -> List[CallHierarchyItem]:
|
||||
"""Get incoming/outgoing calls (if supported by language server)"""
|
||||
try:
|
||||
# Prepare call hierarchy
|
||||
items = await self._lsp_request("textDocument/prepareCallHierarchy", {...})
|
||||
if not items:
|
||||
# Fallback to references if callHierarchy not supported
|
||||
return await self._fallback_to_references(symbol)
|
||||
|
||||
# Get incoming calls
|
||||
incoming = await self._lsp_request("callHierarchy/incomingCalls",
|
||||
{"item": items[0]})
|
||||
return incoming
|
||||
except LspCapabilityNotSupported:
|
||||
return await self._fallback_to_references(symbol)
|
||||
|
||||
async def get_definition(self, symbol: CodeSymbolNode) -> Optional[Location]:
|
||||
"""Get symbol definition location"""
|
||||
...
|
||||
|
||||
async def get_hover(self, symbol: CodeSymbolNode) -> Optional[str]:
|
||||
"""Get hover documentation"""
|
||||
...
|
||||
```
|
||||
|
||||
**Caching Strategy**:
|
||||
- Cache key: `{operation}:{symbol_id}`
|
||||
- Invalidation: Check file modification time
|
||||
- TTL: 5 minutes for frequently accessed files
|
||||
|
||||
**Concurrency Control**:
|
||||
- Max concurrent LSP requests: 10
|
||||
- Request timeout: 2 seconds
|
||||
- Batch requests where possible
|
||||
|
||||
### 5. GraphBuilder (`graph/builder.py`)
|
||||
|
||||
**Role**: Build code association graph from seeds using LSP
|
||||
|
||||
```python
|
||||
class GraphBuilder:
|
||||
def __init__(self, max_depth: int = 2, max_nodes: int = 100):
|
||||
self.max_depth = max_depth
|
||||
self.max_nodes = max_nodes
|
||||
|
||||
async def build_from_seeds(
|
||||
self,
|
||||
seeds: List[CodeSymbolNode],
|
||||
lsp_bridge: LspBridge
|
||||
) -> CodeAssociationGraph:
|
||||
"""Build association graph by expanding from seed nodes"""
|
||||
graph = CodeAssociationGraph()
|
||||
visited: Set[str] = set()
|
||||
queue: List[Tuple[CodeSymbolNode, int]] = [(s, 0) for s in seeds]
|
||||
|
||||
# Parallel expansion with semaphore
|
||||
sem = asyncio.Semaphore(10)
|
||||
|
||||
async def expand_node(node: CodeSymbolNode, depth: int):
|
||||
if node.id in visited or depth > self.max_depth:
|
||||
return
|
||||
if len(graph.nodes) >= self.max_nodes:
|
||||
return
|
||||
|
||||
visited.add(node.id)
|
||||
graph.add_node(node)
|
||||
|
||||
async with sem:
|
||||
# Get relationships in parallel
|
||||
refs, calls = await asyncio.gather(
|
||||
lsp_bridge.get_references(node),
|
||||
lsp_bridge.get_call_hierarchy(node),
|
||||
return_exceptions=True
|
||||
)
|
||||
|
||||
# Add edges
|
||||
for ref in refs:
|
||||
ref_node = await self._location_to_node(ref, lsp_bridge)
|
||||
graph.add_edge(node.id, ref_node.id, "references")
|
||||
queue.append((ref_node, depth + 1))
|
||||
|
||||
for call in calls:
|
||||
call_node = await self._call_to_node(call, lsp_bridge)
|
||||
graph.add_edge(call_node.id, node.id, "calls")
|
||||
queue.append((call_node, depth + 1))
|
||||
|
||||
# BFS expansion
|
||||
while queue and len(graph.nodes) < self.max_nodes:
|
||||
batch = queue[:10]
|
||||
queue = queue[10:]
|
||||
await asyncio.gather(*[expand_node(n, d) for n, d in batch])
|
||||
|
||||
return graph
|
||||
```
|
||||
|
||||
### 6. ClusteringService (`clustering/algorithms.py`)
|
||||
|
||||
**Role**: Group related code symbols and filter noise
|
||||
|
||||
```python
|
||||
class ClusteringService:
|
||||
def __init__(self, resolution: float = 1.0):
|
||||
self.resolution = resolution # Higher = smaller clusters
|
||||
|
||||
def cluster(self, graph: CodeAssociationGraph) -> List[SearchResultCluster]:
|
||||
"""Apply Louvain community detection"""
|
||||
nx_graph = graph.to_networkx()
|
||||
|
||||
# Louvain algorithm
|
||||
communities = community_louvain.best_partition(
|
||||
nx_graph,
|
||||
resolution=self.resolution
|
||||
)
|
||||
|
||||
# Group nodes by community
|
||||
clusters: Dict[int, List[CodeSymbolNode]] = defaultdict(list)
|
||||
for node_id, community_id in communities.items():
|
||||
clusters[community_id].append(graph.nodes[node_id])
|
||||
|
||||
return [
|
||||
SearchResultCluster(
|
||||
cluster_id=f"cluster_{cid}",
|
||||
symbols=nodes,
|
||||
score=0.0, # Will be set by RankingService
|
||||
title="",
|
||||
metadata={"size": len(nodes)}
|
||||
)
|
||||
for cid, nodes in clusters.items()
|
||||
]
|
||||
|
||||
def filter_noise(self, clusters: List[SearchResultCluster]) -> List[SearchResultCluster]:
|
||||
"""Remove noisy clusters and symbols"""
|
||||
filtered = []
|
||||
for cluster in clusters:
|
||||
# Filter high-degree generic nodes
|
||||
cluster.symbols = [
|
||||
s for s in cluster.symbols
|
||||
if not self._is_generic_symbol(s)
|
||||
]
|
||||
|
||||
# Keep clusters with minimum size
|
||||
if len(cluster.symbols) >= 2:
|
||||
filtered.append(cluster)
|
||||
|
||||
return filtered
|
||||
|
||||
def _is_generic_symbol(self, symbol: CodeSymbolNode) -> bool:
|
||||
"""Check if symbol is too generic (log, print, etc.)"""
|
||||
generic_names = {'log', 'print', 'debug', 'error', 'warn',
|
||||
'get', 'set', 'init', '__init__', 'toString'}
|
||||
return symbol.name.lower() in generic_names
|
||||
```
|
||||
|
||||
### 7. RankingService (`ranking/service.py`)
|
||||
|
||||
**Role**: Multi-factor intelligent reranking
|
||||
|
||||
```python
|
||||
@dataclass
|
||||
class RankingWeights:
|
||||
text_relevance: float = 0.4 # w1
|
||||
graph_centrality: float = 0.35 # w2
|
||||
structural_proximity: float = 0.25 # w3
|
||||
|
||||
class RankingService:
|
||||
def __init__(self, weights: RankingWeights = None):
|
||||
self.weights = weights or RankingWeights()
|
||||
|
||||
def rerank(
|
||||
self,
|
||||
clusters: List[SearchResultCluster],
|
||||
seeds: List[CodeSymbolNode],
|
||||
query: str
|
||||
) -> List[SearchResultCluster]:
|
||||
"""Rerank clusters using multi-factor scoring"""
|
||||
seed_ids = {s.id for s in seeds}
|
||||
|
||||
for cluster in clusters:
|
||||
# Build cluster subgraph for centrality
|
||||
subgraph = self._build_subgraph(cluster)
|
||||
pagerank = nx.pagerank(subgraph)
|
||||
|
||||
for symbol in cluster.symbols:
|
||||
# Factor 1: Text relevance (from vector search)
|
||||
text_score = self._compute_text_relevance(symbol, query)
|
||||
|
||||
# Factor 2: Graph centrality (PageRank in cluster)
|
||||
centrality_score = pagerank.get(symbol.id, 0.0)
|
||||
|
||||
# Factor 3: Structural proximity to seeds
|
||||
proximity_score = self._compute_proximity(symbol, seed_ids, subgraph)
|
||||
|
||||
# Combined score
|
||||
symbol.score = (
|
||||
self.weights.text_relevance * text_score +
|
||||
self.weights.graph_centrality * centrality_score +
|
||||
self.weights.structural_proximity * proximity_score
|
||||
)
|
||||
|
||||
# Cluster score = max symbol score
|
||||
cluster.score = max(s.score for s in cluster.symbols)
|
||||
cluster.symbols.sort(key=lambda s: s.score, reverse=True)
|
||||
|
||||
# Sort clusters by score
|
||||
clusters.sort(key=lambda c: c.score, reverse=True)
|
||||
return clusters
|
||||
|
||||
def _compute_proximity(
|
||||
self,
|
||||
symbol: CodeSymbolNode,
|
||||
seed_ids: Set[str],
|
||||
graph: nx.DiGraph
|
||||
) -> float:
|
||||
"""Compute proximity score based on shortest path to seeds"""
|
||||
if symbol.id in seed_ids:
|
||||
return 1.0
|
||||
|
||||
min_distance = float('inf')
|
||||
for seed_id in seed_ids:
|
||||
try:
|
||||
distance = nx.shortest_path_length(graph, seed_id, symbol.id)
|
||||
min_distance = min(min_distance, distance)
|
||||
except nx.NetworkXNoPath:
|
||||
continue
|
||||
|
||||
if min_distance == float('inf'):
|
||||
return 0.0
|
||||
|
||||
# Inverse distance scoring (closer = higher)
|
||||
return 1.0 / (1.0 + min_distance)
|
||||
```
|
||||
|
||||
## API Design
|
||||
|
||||
### Endpoint: `POST /api/v1/hybrid-search`
|
||||
|
||||
**Request**:
|
||||
```json
|
||||
{
|
||||
"query": "user authentication flow",
|
||||
"top_k": 10,
|
||||
"config_overrides": {
|
||||
"ranking_weights": {"w1": 0.5, "w2": 0.3, "w3": 0.2},
|
||||
"max_graph_depth": 2,
|
||||
"clustering_resolution": 1.0
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Response**:
|
||||
```json
|
||||
{
|
||||
"query_id": "hs-20250120-001",
|
||||
"execution_time_ms": 1250,
|
||||
"results": [
|
||||
{
|
||||
"cluster_id": "cluster_0",
|
||||
"score": 0.92,
|
||||
"title": "User Authentication Handler",
|
||||
"symbols": [
|
||||
{
|
||||
"id": "src/auth/handler.py:authenticate:45",
|
||||
"name": "authenticate",
|
||||
"kind": "function",
|
||||
"file_path": "src/auth/handler.py",
|
||||
"range": {"start": {"line": 45, "char": 0}, "end": {"line": 78, "char": 0}},
|
||||
"score": 0.95,
|
||||
"raw_code": "async def authenticate(request: Request):\n ..."
|
||||
},
|
||||
{
|
||||
"id": "src/auth/handler.py:validate_token:80",
|
||||
"name": "validate_token",
|
||||
"kind": "function",
|
||||
"file_path": "src/auth/handler.py",
|
||||
"score": 0.88,
|
||||
"raw_code": "def validate_token(token: str) -> bool:\n ..."
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
## Implementation Priorities
|
||||
|
||||
### P0 - Core Infrastructure (Week 1-2)
|
||||
1. **HybridSearchEngine skeleton** - Basic orchestration without all features
|
||||
2. **LspBridge with caching** - Connect to LanguageServerMultiplexer
|
||||
3. **GraphBuilder basic** - Seed expansion with references only
|
||||
4. **Integration test** - Verify LSP communication works
|
||||
|
||||
### P1 - Search Pipeline (Week 2-3)
|
||||
1. **VectorSearchService** - Embedding model + FAISS index
|
||||
2. **ClusteringService** - Louvain algorithm + noise filtering
|
||||
3. **End-to-end pipeline** - Query to clustered results
|
||||
|
||||
### P2 - Ranking & API (Week 3-4)
|
||||
1. **RankingService** - Multi-factor scoring
|
||||
2. **API endpoint** - FastAPI integration
|
||||
3. **Performance optimization** - Caching, parallelization, timeouts
|
||||
4. **Configuration system** - Dynamic weight adjustment
|
||||
|
||||
## Performance Targets
|
||||
|
||||
| Metric | Target | Strategy |
|
||||
|--------|--------|----------|
|
||||
| End-to-end latency | < 2s | Parallel LSP calls, aggressive caching |
|
||||
| Vector search | < 100ms | FAISS with GPU (optional) |
|
||||
| LSP expansion | < 1s | Max 10 concurrent requests, 2s timeout |
|
||||
| Clustering | < 200ms | Limit graph size to 100 nodes |
|
||||
| Reranking | < 100ms | Pre-computed embeddings |
|
||||
|
||||
## Dependencies
|
||||
|
||||
### External
|
||||
- LanguageServerMultiplexer (from REAL_LSP_SERVER_PLAN.md)
|
||||
- Language servers: pylsp, tsserver, gopls, rust-analyzer
|
||||
|
||||
### Python Libraries
|
||||
- `sentence-transformers` - Embedding models
|
||||
- `faiss-cpu` or `hnswlib` - Vector indexing
|
||||
- `networkx` - Graph algorithms
|
||||
- `python-louvain` - Community detection
|
||||
- `aiohttp` - Async HTTP client
|
||||
|
||||
## File Structure
|
||||
|
||||
```
|
||||
src/codexlens/
|
||||
├── hybrid_search/
|
||||
│ ├── __init__.py
|
||||
│ ├── engine.py # HybridSearchEngine
|
||||
│ ├── pipeline.py # Pipeline stage definitions
|
||||
│ └── data_structures.py # CodeSymbolNode, Graph, Cluster
|
||||
├── services/
|
||||
│ ├── vector_search.py # VectorSearchService
|
||||
│ └── lsp_bridge.py # LspBridge
|
||||
├── graph/
|
||||
│ └── builder.py # GraphBuilder
|
||||
├── clustering/
|
||||
│ └── algorithms.py # ClusteringService
|
||||
├── ranking/
|
||||
│ └── service.py # RankingService
|
||||
├── api/
|
||||
│ └── endpoints.py # API routes
|
||||
└── configs/
|
||||
└── hybrid_search_config.py
|
||||
```
|
||||
|
||||
## Risk Mitigation
|
||||
|
||||
| Risk | Impact | Mitigation |
|
||||
|------|--------|------------|
|
||||
| LSP timeout | High | Fallback to vector-only results |
|
||||
| LSP not available | High | Graceful degradation to CodexLens index |
|
||||
| Large codebases | Medium | Limit graph expansion, pagination |
|
||||
| Language server crash | Medium | Auto-restart, circuit breaker |
|
||||
| Clustering quality | Low | Tunable resolution parameter |
|
||||
|
||||
---
|
||||
|
||||
*Generated from Gemini analysis (Session: 1768836775699-gemini)*
|
||||
*Date: 2025-01-20*
|
||||
@@ -1,363 +0,0 @@
|
||||
# CodexLens Real LSP Implementation - Summary
|
||||
|
||||
> **Date**: 2026-01-19
|
||||
> **Status**: Planning Complete, Implementation Ready
|
||||
> **Focus**: Real LSP Server + VSCode Bridge Integration
|
||||
|
||||
---
|
||||
|
||||
## ✅ Completed Work
|
||||
|
||||
### 1. Planning Documents
|
||||
|
||||
#### a. Main Implementation Plan
|
||||
**File**: `docs/REAL_LSP_SERVER_PLAN.md`
|
||||
|
||||
**Content**:
|
||||
- Complete architecture design for real LSP server
|
||||
- 5-phase implementation plan
|
||||
- Multi-language support strategy (TypeScript, Python, Go, Rust, Java, C/C++)
|
||||
- Language server multiplexer design
|
||||
- Position tolerance feature (cclsp-like)
|
||||
- MCP integration layer
|
||||
|
||||
**Key Decisions**:
|
||||
- Use `pygls` library for LSP implementation
|
||||
- Support 6+ language servers via multiplexer
|
||||
- Implement position tolerance for fuzzy AI-generated positions
|
||||
- Three integration paths: Standalone LSP, VSCode Bridge, Index-based fallback
|
||||
|
||||
#### b. VSCode Bridge Implementation (Appendix A)
|
||||
**Included in**: `docs/REAL_LSP_SERVER_PLAN.md`
|
||||
|
||||
**Content**:
|
||||
- HTTP-based VSCode extension bridge
|
||||
- MCP tool integration (vscode_lsp)
|
||||
- Complete architecture diagram
|
||||
- API endpoint specifications
|
||||
- Comparison with standalone LSP approach
|
||||
|
||||
### 2. VSCode Bridge Extension
|
||||
|
||||
#### Created Files:
|
||||
1. **`ccw-vscode-bridge/package.json`**
|
||||
- VSCode extension manifest
|
||||
- Dependencies: @types/node, @types/vscode, typescript
|
||||
|
||||
2. **`ccw-vscode-bridge/tsconfig.json`**
|
||||
- TypeScript compilation configuration
|
||||
- Target: ES2020, CommonJS modules
|
||||
|
||||
3. **`ccw-vscode-bridge/src/extension.ts`**
|
||||
- HTTP server on port 3457
|
||||
- 4 API endpoints:
|
||||
- `POST /get_definition`
|
||||
- `POST /get_references`
|
||||
- `POST /get_hover`
|
||||
- `POST /get_document_symbols`
|
||||
- VSCode API integration via `vscode.commands.executeCommand`
|
||||
|
||||
4. **`ccw-vscode-bridge/.vscodeignore`**
|
||||
- Build artifact exclusion rules
|
||||
|
||||
5. **`ccw-vscode-bridge/README.md`**
|
||||
- Installation & usage instructions
|
||||
- API endpoint documentation
|
||||
|
||||
#### Features:
|
||||
- ✅ Real-time VSCode LSP integration
|
||||
- ✅ HTTP REST API for external tools
|
||||
- ✅ CORS support
|
||||
- ✅ Error handling
|
||||
- ✅ Automatic VSCode feature detection
|
||||
|
||||
### 3. CCW MCP Tool
|
||||
|
||||
#### Created File:
|
||||
**`ccw/src/tools/vscode-lsp.ts`**
|
||||
|
||||
**Features**:
|
||||
- ✅ 4 LSP actions: get_definition, get_references, get_hover, get_document_symbols
|
||||
- ✅ Zod schema validation
|
||||
- ✅ HTTP client with timeout (10s)
|
||||
- ✅ Connection retry logic
|
||||
- ✅ Comprehensive error messages
|
||||
|
||||
**Parameters**:
|
||||
- `action` (required): LSP action type
|
||||
- `file_path` (required): Absolute file path
|
||||
- `line` (optional): Line number (1-based)
|
||||
- `character` (optional): Character position (1-based)
|
||||
|
||||
#### Integration:
|
||||
**Modified File**: `ccw/src/tools/index.ts`
|
||||
|
||||
- ✅ Imported `vscodeLspMod`
|
||||
- ✅ Registered tool via `registerTool(toLegacyTool(vscodeLspMod))`
|
||||
- ✅ Available in MCP server tool list
|
||||
|
||||
---
|
||||
|
||||
## 📋 Implementation Architecture
|
||||
|
||||
### Three Integration Paths
|
||||
|
||||
```
|
||||
Path 1: VSCode Bridge (✅ Implemented)
|
||||
─────────────────────────────────────
|
||||
Claude Code → vscode_lsp MCP tool → HTTP → ccw-vscode-bridge → VSCode API → Language Servers
|
||||
|
||||
Path 2: Standalone LSP Server (📝 Planned)
|
||||
──────────────────────────────────────────
|
||||
Any LSP Client → codexlens-lsp → Language Server Multiplexer → Language Servers
|
||||
|
||||
Path 3: Index-Based (✅ Existing)
|
||||
─────────────────────────────────
|
||||
Claude Code → codex_lens_lsp → Python API → SQLite Index → Cached Results
|
||||
```
|
||||
|
||||
### Smart Routing Strategy
|
||||
|
||||
```javascript
|
||||
// Priority: VSCode Bridge → Standalone LSP → Index-based
|
||||
if (vscodeBridgeAvailable) {
|
||||
return useVSCodeBridge();
|
||||
} else if (standaloneLSPAvailable) {
|
||||
return useStandaloneLSP();
|
||||
} else {
|
||||
return useIndexBased();
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 🎯 Next Steps
|
||||
|
||||
### Immediate Actions (Phase 1)
|
||||
|
||||
1. **Test VSCode Bridge**
|
||||
```bash
|
||||
cd ccw-vscode-bridge
|
||||
npm install
|
||||
npm run compile
|
||||
# Press F5 in VSCode to launch extension
|
||||
```
|
||||
|
||||
2. **Test vscode_lsp Tool**
|
||||
```bash
|
||||
# Start CCW MCP server
|
||||
cd ccw
|
||||
npm run mcp
|
||||
|
||||
# Test via MCP client
|
||||
{
|
||||
"tool": "vscode_lsp",
|
||||
"arguments": {
|
||||
"action": "get_definition",
|
||||
"file_path": "/path/to/file.ts",
|
||||
"line": 10,
|
||||
"character": 5
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
3. **Document Testing Results**
|
||||
- Create test reports
|
||||
- Benchmark latency
|
||||
- Validate accuracy
|
||||
|
||||
### Medium-Term Goals (Phase 2-3)
|
||||
|
||||
1. **Implement Standalone LSP Server**
|
||||
- Setup `codexlens-lsp` project structure
|
||||
- Implement language server multiplexer
|
||||
- Add core LSP handlers
|
||||
|
||||
2. **Add Position Tolerance**
|
||||
- Implement fuzzy position matching
|
||||
- Test with AI-generated positions
|
||||
|
||||
3. **Create Integration Tests**
|
||||
- Unit tests for each component
|
||||
- E2E tests with real language servers
|
||||
- Performance benchmarks
|
||||
|
||||
### Long-Term Goals (Phase 4-5)
|
||||
|
||||
1. **MCP Context Enhancement**
|
||||
- Integrate LSP results into MCP context
|
||||
- Hook system for Claude Code
|
||||
|
||||
2. **Advanced Features**
|
||||
- Code actions
|
||||
- Formatting
|
||||
- Rename support
|
||||
|
||||
3. **Production Deployment**
|
||||
- Package VSCode extension to .vsix
|
||||
- Publish to VS Code marketplace
|
||||
- Create installation scripts
|
||||
|
||||
---
|
||||
|
||||
## 📊 Project Status Matrix
|
||||
|
||||
| Component | Status | Files | Tests | Docs |
|
||||
|-----------|--------|-------|-------|------|
|
||||
| VSCode Bridge Extension | ✅ Complete | 5/5 | ⏳ Pending | ✅ Complete |
|
||||
| vscode_lsp MCP Tool | ✅ Complete | 1/1 | ⏳ Pending | ✅ Complete |
|
||||
| Tool Registration | ✅ Complete | 1/1 | N/A | N/A |
|
||||
| Planning Documents | ✅ Complete | 2/2 | N/A | ✅ Complete |
|
||||
| Standalone LSP Server | 📝 Planned | 0/8 | 0/12 | ✅ Complete |
|
||||
| Integration Tests | 📝 Planned | 0/3 | 0/15 | ⏳ Pending |
|
||||
|
||||
---
|
||||
|
||||
## 🔧 Development Environment
|
||||
|
||||
### Prerequisites
|
||||
|
||||
**For VSCode Bridge**:
|
||||
- Node.js ≥ 18
|
||||
- VSCode ≥ 1.80
|
||||
- TypeScript ≥ 5.0
|
||||
|
||||
**For Standalone LSP**:
|
||||
- Python ≥ 3.8
|
||||
- pygls ≥ 1.3.0
|
||||
- Language servers:
|
||||
- TypeScript: `npm i -g typescript-language-server`
|
||||
- Python: `pip install python-lsp-server`
|
||||
- Go: `go install golang.org/x/tools/gopls@latest`
|
||||
- Rust: `rustup component add rust-analyzer`
|
||||
|
||||
### Installation Commands
|
||||
|
||||
```bash
|
||||
# VSCode Bridge
|
||||
cd ccw-vscode-bridge
|
||||
npm install
|
||||
npm run compile
|
||||
|
||||
# CCW MCP (already setup)
|
||||
cd ccw
|
||||
npm install
|
||||
|
||||
# Future: Standalone LSP
|
||||
cd codex-lens
|
||||
pip install -e ".[lsp]"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 📖 Documentation Index
|
||||
|
||||
| Document | Purpose | Status |
|
||||
|----------|---------|--------|
|
||||
| `REAL_LSP_SERVER_PLAN.md` | Complete implementation plan | ✅ |
|
||||
| `LSP_INTEGRATION_PLAN.md` | Original integration strategy | ✅ |
|
||||
| `MCP_ENDPOINT_DESIGN.md` | MCP endpoint specifications | ✅ |
|
||||
| `IMPLEMENTATION_SUMMARY.md` | This document | ✅ |
|
||||
| `ccw-vscode-bridge/README.md` | Bridge usage guide | ✅ |
|
||||
| `TESTING_GUIDE.md` | Testing procedures | ⏳ TODO |
|
||||
| `DEPLOYMENT_GUIDE.md` | Production deployment | ⏳ TODO |
|
||||
|
||||
---
|
||||
|
||||
## 💡 Key Design Decisions
|
||||
|
||||
### 1. Why Three Integration Paths?
|
||||
|
||||
- **VSCode Bridge**: Easiest setup, leverages VSCode's built-in language servers
|
||||
- **Standalone LSP**: IDE-agnostic, works with any LSP client
|
||||
- **Index-based**: Fallback for offline or cached queries
|
||||
|
||||
### 2. Why HTTP for VSCode Bridge?
|
||||
|
||||
- ✅ Simplest cross-process communication
|
||||
- ✅ No complex IPC/socket management
|
||||
- ✅ Easy to debug with curl/Postman
|
||||
- ✅ CORS support for web-based tools
|
||||
|
||||
### 3. Why Port 3457?
|
||||
|
||||
- Unique port unlikely to conflict
|
||||
- Easy to remember (345-7)
|
||||
- Same approach as cclsp (uses stdio)
|
||||
|
||||
### 4. Why Not Modify smart_search?
|
||||
|
||||
User feedback:
|
||||
> "第一种跟当前的符号搜索没区别哎"
|
||||
> (Method 1 has no difference from current symbol search)
|
||||
|
||||
**Solution**: Implement real LSP server that connects to live language servers, not pre-indexed data.
|
||||
|
||||
---
|
||||
|
||||
## 🚀 Quick Start Guide
|
||||
|
||||
### Test VSCode Bridge Now
|
||||
|
||||
1. **Install Extension**:
|
||||
```bash
|
||||
cd ccw-vscode-bridge
|
||||
npm install && npm run compile
|
||||
code --install-extension .
|
||||
```
|
||||
|
||||
2. **Reload VSCode**:
|
||||
- Press `Cmd+Shift+P` (Mac) or `Ctrl+Shift+P` (Windows)
|
||||
- Type "Reload Window"
|
||||
|
||||
3. **Verify Bridge is Running**:
|
||||
```bash
|
||||
curl http://localhost:3457/get_definition \
|
||||
-X POST \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"file_path":"/path/to/file.ts","line":10,"character":5}'
|
||||
```
|
||||
|
||||
4. **Test via CCW**:
|
||||
```javascript
|
||||
// In Claude Code or MCP client
|
||||
await executeTool('vscode_lsp', {
|
||||
action: 'get_definition',
|
||||
file_path: '/absolute/path/to/file.ts',
|
||||
line: 10,
|
||||
character: 5
|
||||
});
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 📞 Support & Troubleshooting
|
||||
|
||||
### Common Issues
|
||||
|
||||
**Issue**: "Could not connect to VSCode Bridge"
|
||||
**Solution**:
|
||||
1. Ensure VSCode is running
|
||||
2. Check if extension is activated: `Cmd+Shift+P` → "CCW VSCode Bridge"
|
||||
3. Verify port 3457 is not in use: `lsof -i :3457`
|
||||
|
||||
**Issue**: "No LSP server available"
|
||||
**Solution**: Open the file in VSCode workspace first
|
||||
|
||||
**Issue**: "File not found"
|
||||
**Solution**: Use absolute paths, not relative
|
||||
|
||||
---
|
||||
|
||||
## 📝 Change Log
|
||||
|
||||
### 2026-01-19 - Initial Implementation
|
||||
- Created VSCode Bridge extension (5 files)
|
||||
- Implemented vscode_lsp MCP tool
|
||||
- Registered tool in CCW registry
|
||||
- Completed planning documentation
|
||||
- Added comprehensive architecture diagrams
|
||||
|
||||
---
|
||||
|
||||
**Document End**
|
||||
@@ -1,342 +0,0 @@
|
||||
# LLM增强功能移除总结
|
||||
|
||||
**移除日期**: 2025-12-16
|
||||
**执行者**: 用户请求
|
||||
**状态**: ✅ 完成
|
||||
|
||||
---
|
||||
|
||||
## 📋 移除清单
|
||||
|
||||
### ✅ 已删除的源代码文件
|
||||
|
||||
| 文件 | 说明 |
|
||||
|------|------|
|
||||
| `src/codexlens/semantic/llm_enhancer.py` | LLM增强核心模块 (900+ lines) |
|
||||
|
||||
### ✅ 已修改的源代码文件
|
||||
|
||||
| 文件 | 修改内容 |
|
||||
|------|---------|
|
||||
| `src/codexlens/cli/commands.py` | 删除 `enhance` 命令 (lines 1050-1227) |
|
||||
| `src/codexlens/semantic/__init__.py` | 删除LLM相关导出 (lines 35-69) |
|
||||
|
||||
### ✅ 已修改的前端文件(CCW Dashboard)
|
||||
|
||||
| 文件 | 修改内容 |
|
||||
|------|---------|
|
||||
| `ccw/src/templates/dashboard-js/components/cli-status.js` | 删除LLM增强设置 (8行)、Semantic Settings Modal (615行)、Metadata Viewer (326行) |
|
||||
| `ccw/src/templates/dashboard-js/i18n.js` | 删除英文LLM翻译 (26行)、中文LLM翻译 (26行) |
|
||||
| `ccw/src/templates/dashboard-js/views/cli-manager.js` | 移除LLM badge和设置modal调用 (3行) |
|
||||
|
||||
### ✅ 已删除的测试文件
|
||||
|
||||
| 文件 | 说明 |
|
||||
|------|------|
|
||||
| `tests/test_llm_enhancer.py` | LLM增强单元测试 |
|
||||
| `tests/test_llm_enhanced_search.py` | LLM vs 纯向量对比测试 (550+ lines) |
|
||||
|
||||
### ✅ 已删除的脚本文件
|
||||
|
||||
| 文件 | 说明 |
|
||||
|------|------|
|
||||
| `scripts/compare_search_methods.py` | 纯向量 vs LLM增强对比脚本 (460+ lines) |
|
||||
| `scripts/test_misleading_comments.py` | 误导性注释测试脚本 (490+ lines) |
|
||||
| `scripts/show_llm_analysis.py` | LLM分析展示工具 |
|
||||
| `scripts/inspect_llm_summaries.py` | LLM摘要检查工具 |
|
||||
|
||||
### ✅ 已删除的文档文件
|
||||
|
||||
| 文件 | 说明 |
|
||||
|------|------|
|
||||
| `docs/LLM_ENHANCED_SEARCH_GUIDE.md` | LLM增强使用指南 (460+ lines) |
|
||||
| `docs/LLM_ENHANCEMENT_TEST_RESULTS.md` | LLM测试结果文档 |
|
||||
| `docs/MISLEADING_COMMENTS_TEST_RESULTS.md` | 误导性注释测试结果 |
|
||||
| `docs/CLI_INTEGRATION_SUMMARY.md` | CLI集成文档(包含enhance命令) |
|
||||
| `docs/DOCSTRING_LLM_HYBRID_DESIGN.md` | Docstring与LLM混合策略设计 |
|
||||
|
||||
### ✅ 已更新的文档
|
||||
|
||||
| 文件 | 修改内容 |
|
||||
|------|---------|
|
||||
| `docs/IMPLEMENTATION_SUMMARY.md` | 添加LLM移除说明,列出已删除内容 |
|
||||
|
||||
### 📚 保留的设计文档(作为历史参考)
|
||||
|
||||
| 文件 | 说明 |
|
||||
|------|------|
|
||||
| `docs/DESIGN_EVALUATION_REPORT.md` | 包含LLM混合策略的技术评估报告 |
|
||||
| `docs/SEMANTIC_GRAPH_DESIGN.md` | 语义图谱设计(可能提及LLM) |
|
||||
| `docs/MULTILEVEL_CHUNKER_DESIGN.md` | 多层次分词器设计(可能提及LLM) |
|
||||
|
||||
*这些文档保留作为技术历史参考,不影响当前功能。*
|
||||
|
||||
---
|
||||
|
||||
## 🔒 移除的功能
|
||||
|
||||
### CLI命令
|
||||
|
||||
```bash
|
||||
# 已移除 - 不再可用
|
||||
codexlens enhance [PATH] --tool gemini --batch-size 5
|
||||
|
||||
# 说明:此命令用于通过CCW CLI调用Gemini/Qwen生成代码摘要
|
||||
# 移除原因:减少外部依赖,简化维护
|
||||
```
|
||||
|
||||
### Python API
|
||||
|
||||
```python
|
||||
# 已移除 - 不再可用
|
||||
from codexlens.semantic import (
|
||||
LLMEnhancer,
|
||||
LLMConfig,
|
||||
SemanticMetadata,
|
||||
FileData,
|
||||
EnhancedSemanticIndexer,
|
||||
create_enhancer,
|
||||
create_enhanced_indexer,
|
||||
)
|
||||
|
||||
# 移除的类和函数:
|
||||
# - LLMEnhancer: LLM增强器主类
|
||||
# - LLMConfig: LLM配置类
|
||||
# - SemanticMetadata: 语义元数据结构
|
||||
# - FileData: 文件数据结构
|
||||
# - EnhancedSemanticIndexer: LLM增强索引器
|
||||
# - create_enhancer(): 创建增强器的工厂函数
|
||||
# - create_enhanced_indexer(): 创建增强索引器的工厂函数
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## ✅ 保留的功能
|
||||
|
||||
### 完全保留的核心功能
|
||||
|
||||
| 功能 | 状态 |
|
||||
|------|------|
|
||||
| **纯向量搜索** | ✅ 完整保留 |
|
||||
| **语义嵌入生成** | ✅ 完整保留 (`codexlens embeddings-generate`) |
|
||||
| **语义嵌入状态检查** | ✅ 完整保留 (`codexlens embeddings-status`) |
|
||||
| **混合搜索引擎** | ✅ 完整保留(exact + fuzzy + vector) |
|
||||
| **向量存储** | ✅ 完整保留 |
|
||||
| **语义分块** | ✅ 完整保留 |
|
||||
| **fastembed集成** | ✅ 完整保留 |
|
||||
|
||||
### 可用的CLI命令
|
||||
|
||||
```bash
|
||||
# 生成纯向量嵌入(无需LLM)
|
||||
codexlens embeddings-generate [PATH]
|
||||
|
||||
# 检查嵌入状态
|
||||
codexlens embeddings-status [PATH]
|
||||
|
||||
# 所有搜索命令
|
||||
codexlens search [QUERY] --index [PATH]
|
||||
|
||||
# 所有索引管理命令
|
||||
codexlens init [PATH]
|
||||
codexlens update [PATH]
|
||||
codexlens clean [PATH]
|
||||
```
|
||||
|
||||
### 可用的Python API
|
||||
|
||||
```python
|
||||
# 完全可用 - 纯向量搜索
|
||||
from codexlens.semantic import SEMANTIC_AVAILABLE, SEMANTIC_BACKEND
|
||||
from codexlens.semantic.embedder import Embedder
|
||||
from codexlens.semantic.vector_store import VectorStore
|
||||
from codexlens.semantic.chunker import Chunker, ChunkConfig
|
||||
from codexlens.search.hybrid_search import HybridSearchEngine
|
||||
|
||||
# 示例:纯向量搜索
|
||||
engine = HybridSearchEngine()
|
||||
results = engine.search(
|
||||
index_path,
|
||||
query="your search query",
|
||||
enable_vector=True,
|
||||
pure_vector=True, # 纯向量模式
|
||||
)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 🎯 移除原因
|
||||
|
||||
### 1. 简化依赖
|
||||
|
||||
**移除的外部依赖**:
|
||||
- CCW CLI (npm package)
|
||||
- Gemini API (需要API密钥)
|
||||
- Qwen API (可选)
|
||||
|
||||
**保留的依赖**:
|
||||
- fastembed (ONNX-based,轻量级)
|
||||
- numpy
|
||||
- Python标准库
|
||||
|
||||
### 2. 减少复杂性
|
||||
|
||||
- **前**: 两种搜索方式(纯向量 + LLM增强)
|
||||
- **后**: 一种搜索方式(纯向量)
|
||||
- 移除了900+ lines的LLM增强代码
|
||||
- 移除了CLI命令和相关配置
|
||||
- 移除了测试和文档
|
||||
|
||||
### 3. 性能考虑
|
||||
|
||||
| 方面 | LLM增强 | 纯向量 |
|
||||
|------|---------|--------|
|
||||
| **索引速度** | 慢75倍 | 基准 |
|
||||
| **查询速度** | 相同 | 相同 |
|
||||
| **准确率** | 相同* | 基准 |
|
||||
| **成本** | API费用 | 免费 |
|
||||
|
||||
*在测试数据集上准确率相同(5/5),但LLM增强理论上在更复杂场景下可能更好
|
||||
|
||||
### 4. 维护负担
|
||||
|
||||
**移除前**:
|
||||
- 需要维护CCW CLI集成
|
||||
- 需要处理API限流和错误
|
||||
- 需要测试多个LLM后端
|
||||
- 需要维护批处理逻辑
|
||||
|
||||
**移除后**:
|
||||
- 单一嵌入引擎(fastembed)
|
||||
- 无外部API依赖
|
||||
- 更简单的错误处理
|
||||
- 更容易测试
|
||||
|
||||
---
|
||||
|
||||
## 🔍 验证结果
|
||||
|
||||
### 导入测试
|
||||
|
||||
```bash
|
||||
# ✅ 通过 - 语义模块正常
|
||||
python -c "from codexlens.semantic import SEMANTIC_AVAILABLE; print(SEMANTIC_AVAILABLE)"
|
||||
# Output: True
|
||||
|
||||
# ✅ 通过 - 搜索引擎正常
|
||||
python -c "from codexlens.search.hybrid_search import HybridSearchEngine; print('OK')"
|
||||
# Output: OK
|
||||
```
|
||||
|
||||
### 代码清洁度验证
|
||||
|
||||
```bash
|
||||
# ✅ 通过 - 无遗留LLM引用
|
||||
grep -r "llm_enhancer\|LLMEnhancer\|LLMConfig" src/ --include="*.py"
|
||||
# Output: (空)
|
||||
```
|
||||
|
||||
### 测试结果
|
||||
|
||||
```bash
|
||||
# ✅ 5/7通过 - 纯向量搜索基本功能正常
|
||||
pytest tests/test_pure_vector_search.py -v
|
||||
# 通过: 5个基本测试
|
||||
# 失败: 2个嵌入测试(已知的模型维度不匹配问题,与LLM移除无关)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 📊 统计
|
||||
|
||||
### 代码删除统计
|
||||
|
||||
| 类型 | 删除文件数 | 删除行数(估计) |
|
||||
|------|-----------|-----------------|
|
||||
| **源代码** | 1 | ~900 lines |
|
||||
| **CLI命令** | 1 command | ~180 lines |
|
||||
| **导出清理** | 1 section | ~35 lines |
|
||||
| **前端代码** | 3 files | ~1000 lines |
|
||||
| **测试文件** | 2 | ~600 lines |
|
||||
| **脚本工具** | 4 | ~1500 lines |
|
||||
| **文档** | 5 | ~2000 lines |
|
||||
| **总计** | 16 files/sections | ~6200 lines |
|
||||
|
||||
### 依赖简化
|
||||
|
||||
| 方面 | 移除前 | 移除后 |
|
||||
|------|--------|--------|
|
||||
| **外部工具依赖** | CCW CLI, Gemini/Qwen | 无 |
|
||||
| **Python包依赖** | fastembed, numpy | fastembed, numpy |
|
||||
| **API依赖** | Gemini/Qwen API | 无 |
|
||||
| **配置复杂度** | 高(tool, batch_size, API keys) | 低(model profile) |
|
||||
|
||||
---
|
||||
|
||||
## 🚀 后续建议
|
||||
|
||||
### 如果需要LLM增强功能
|
||||
|
||||
1. **从git历史恢复**
|
||||
```bash
|
||||
# 查看删除前的提交
|
||||
git log --all --full-history -- "*llm_enhancer*"
|
||||
|
||||
# 恢复特定文件
|
||||
git checkout <commit-hash> -- src/codexlens/semantic/llm_enhancer.py
|
||||
```
|
||||
|
||||
2. **或使用外部工具**
|
||||
- 在索引前使用独立脚本生成摘要
|
||||
- 将摘要作为注释添加到代码中
|
||||
- 然后使用纯向量索引(会包含摘要)
|
||||
|
||||
3. **或考虑轻量级替代方案**
|
||||
- 使用本地小模型(llama.cpp, ggml)
|
||||
- 使用docstring提取(无需LLM)
|
||||
- 使用静态分析生成摘要
|
||||
|
||||
### 代码库维护建议
|
||||
|
||||
1. ✅ **保持简单** - 继续使用纯向量搜索
|
||||
2. ✅ **优化现有功能** - 改进向量搜索准确性
|
||||
3. ✅ **增量改进** - 优化分块策略和嵌入质量
|
||||
4. ⚠️ **避免重复** - 如需LLM,先评估是否真正必要
|
||||
|
||||
---
|
||||
|
||||
## 📝 文件清单
|
||||
|
||||
### 删除的文件完整列表
|
||||
|
||||
```
|
||||
src/codexlens/semantic/llm_enhancer.py
|
||||
tests/test_llm_enhancer.py
|
||||
tests/test_llm_enhanced_search.py
|
||||
scripts/compare_search_methods.py
|
||||
scripts/test_misleading_comments.py
|
||||
scripts/show_llm_analysis.py
|
||||
scripts/inspect_llm_summaries.py
|
||||
docs/LLM_ENHANCED_SEARCH_GUIDE.md
|
||||
docs/LLM_ENHANCEMENT_TEST_RESULTS.md
|
||||
docs/MISLEADING_COMMENTS_TEST_RESULTS.md
|
||||
docs/CLI_INTEGRATION_SUMMARY.md
|
||||
docs/DOCSTRING_LLM_HYBRID_DESIGN.md
|
||||
```
|
||||
|
||||
### 修改的文件
|
||||
|
||||
```
|
||||
src/codexlens/cli/commands.py (删除enhance命令)
|
||||
src/codexlens/semantic/__init__.py (删除LLM导出)
|
||||
ccw/src/templates/dashboard-js/components/cli-status.js (删除LLM配置、Settings Modal、Metadata Viewer)
|
||||
ccw/src/templates/dashboard-js/i18n.js (删除LLM翻译字符串)
|
||||
ccw/src/templates/dashboard-js/views/cli-manager.js (移除LLM badge和modal调用)
|
||||
docs/IMPLEMENTATION_SUMMARY.md (添加移除说明)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
**移除完成时间**: 2025-12-16
|
||||
**文档版本**: 1.0
|
||||
**验证状态**: ✅ 通过
|
||||
@@ -1,316 +0,0 @@
|
||||
# codex-lens LSP Integration Execution Checklist
|
||||
|
||||
> Generated: 2026-01-15
|
||||
> Based on: Gemini multi-round deep analysis
|
||||
> Status: Ready for implementation
|
||||
|
||||
---
|
||||
|
||||
## Phase 1: LSP Server Foundation (Priority: HIGH)
|
||||
|
||||
### 1.1 Create LSP Server Entry Point
|
||||
- [ ] **Install pygls dependency**
|
||||
```bash
|
||||
pip install pygls
|
||||
```
|
||||
- [ ] **Create `src/codexlens/lsp/__init__.py`**
|
||||
- Export: `CodexLensServer`, `start_server`
|
||||
- [ ] **Create `src/codexlens/lsp/server.py`**
|
||||
- Class: `CodexLensServer(LanguageServer)`
|
||||
- Initialize: `ChainSearchEngine`, `GlobalSymbolIndex`, `WatcherManager`
|
||||
- Lifecycle: Start `WatcherManager` on `initialize` request
|
||||
|
||||
### 1.2 Implement Core LSP Handlers
|
||||
- [ ] **`textDocument/definition`** handler
|
||||
- Source: `GlobalSymbolIndex.search()` exact match
|
||||
- Reference: `storage/global_index.py:173`
|
||||
- Return: `Location(uri, Range)`
|
||||
|
||||
- [ ] **`textDocument/completion`** handler
|
||||
- Source: `GlobalSymbolIndex.search(prefix_mode=True)`
|
||||
- Reference: `storage/global_index.py:173`
|
||||
- Return: `CompletionItem[]`
|
||||
|
||||
- [ ] **`workspace/symbol`** handler
|
||||
- Source: `ChainSearchEngine.search_symbols()`
|
||||
- Reference: `search/chain_search.py:618`
|
||||
- Return: `SymbolInformation[]`
|
||||
|
||||
### 1.3 Wire File Watcher to LSP Events
|
||||
- [ ] **`workspace/didChangeWatchedFiles`** handler
|
||||
- Delegate to: `WatcherManager.process_changes()`
|
||||
- Reference: `watcher/manager.py:53`
|
||||
|
||||
- [ ] **`textDocument/didSave`** handler
|
||||
- Trigger: `IncrementalIndexer` for single file
|
||||
- Reference: `watcher/incremental_indexer.py`
|
||||
|
||||
### 1.4 Deliverables
|
||||
- [ ] Unit tests for LSP handlers
|
||||
- [ ] Integration test: definition lookup
|
||||
- [ ] Integration test: completion prefix search
|
||||
- [ ] Benchmark: query latency < 50ms
|
||||
|
||||
---
|
||||
|
||||
## Phase 2: Find References Implementation (Priority: MEDIUM)
|
||||
|
||||
### 2.1 Create `search_references` Method
|
||||
- [ ] **Add to `src/codexlens/search/chain_search.py`**
|
||||
```python
|
||||
def search_references(
|
||||
self,
|
||||
symbol_name: str,
|
||||
source_path: Path,
|
||||
depth: int = -1
|
||||
) -> List[ReferenceResult]:
|
||||
"""Find all references to a symbol across the project."""
|
||||
```
|
||||
|
||||
### 2.2 Implement Parallel Query Orchestration
|
||||
- [ ] **Collect index paths**
|
||||
- Use: `_collect_index_paths()` existing method
|
||||
|
||||
- [ ] **Parallel query execution**
|
||||
- ThreadPoolExecutor across all `_index.db`
|
||||
- SQL: `SELECT * FROM code_relationships WHERE target_qualified_name = ?`
|
||||
- Reference: `storage/sqlite_store.py:348`
|
||||
|
||||
- [ ] **Result aggregation**
|
||||
- Deduplicate by file:line
|
||||
- Sort by file path, then line number
|
||||
|
||||
### 2.3 LSP Handler
|
||||
- [ ] **`textDocument/references`** handler
|
||||
- Call: `ChainSearchEngine.search_references()`
|
||||
- Return: `Location[]`
|
||||
|
||||
### 2.4 Deliverables
|
||||
- [ ] Unit test: single-index reference lookup
|
||||
- [ ] Integration test: cross-directory references
|
||||
- [ ] Benchmark: < 200ms for 10+ index files
|
||||
|
||||
---
|
||||
|
||||
## Phase 3: Enhanced Hover Information (Priority: MEDIUM)
|
||||
|
||||
### 3.1 Implement Hover Data Extraction
|
||||
- [ ] **Create `src/codexlens/lsp/hover_provider.py`**
|
||||
```python
|
||||
class HoverProvider:
|
||||
def get_hover_info(self, symbol: Symbol) -> HoverInfo:
|
||||
"""Extract hover information for a symbol."""
|
||||
```
|
||||
|
||||
### 3.2 Data Sources
|
||||
- [ ] **Symbol metadata**
|
||||
- Source: `GlobalSymbolIndex.search()`
|
||||
- Fields: `kind`, `name`, `file_path`, `range`
|
||||
|
||||
- [ ] **Source code extraction**
|
||||
- Source: `SQLiteStore.files` table
|
||||
- Reference: `storage/sqlite_store.py:284`
|
||||
- Extract: Lines from `range[0]` to `range[1]`
|
||||
|
||||
### 3.3 LSP Handler
|
||||
- [ ] **`textDocument/hover`** handler
|
||||
- Return: `Hover(contents=MarkupContent)`
|
||||
- Format: Markdown with code fence
|
||||
|
||||
### 3.4 Deliverables
|
||||
- [ ] Unit test: hover for function/class/variable
|
||||
- [ ] Integration test: multi-line function signature
|
||||
|
||||
---
|
||||
|
||||
## Phase 4: MCP Bridge for Claude Code (Priority: HIGH VALUE)
|
||||
|
||||
### 4.1 Define MCP Schema
|
||||
- [ ] **Create `src/codexlens/mcp/__init__.py`**
|
||||
- [ ] **Create `src/codexlens/mcp/schema.py`**
|
||||
```python
|
||||
@dataclass
|
||||
class MCPContext:
|
||||
version: str = "1.0"
|
||||
context_type: str
|
||||
symbol: Optional[SymbolInfo]
|
||||
definition: Optional[str]
|
||||
references: List[ReferenceInfo]
|
||||
related_symbols: List[SymbolInfo]
|
||||
```
|
||||
|
||||
### 4.2 Create MCP Provider
|
||||
- [ ] **Create `src/codexlens/mcp/provider.py`**
|
||||
```python
|
||||
class MCPProvider:
|
||||
def build_context(
|
||||
self,
|
||||
symbol_name: str,
|
||||
context_type: str = "symbol_explanation"
|
||||
) -> MCPContext:
|
||||
"""Build structured context for LLM consumption."""
|
||||
```
|
||||
|
||||
### 4.3 Context Building Logic
|
||||
- [ ] **Symbol lookup**
|
||||
- Use: `GlobalSymbolIndex.search()`
|
||||
|
||||
- [ ] **Definition extraction**
|
||||
- Use: `SQLiteStore` file content
|
||||
|
||||
- [ ] **References collection**
|
||||
- Use: `ChainSearchEngine.search_references()`
|
||||
|
||||
- [ ] **Related symbols**
|
||||
- Use: `code_relationships` for imports/calls
|
||||
|
||||
### 4.4 Hook Integration Points
|
||||
- [ ] **Document `pre-tool` hook interface**
|
||||
```python
|
||||
def pre_tool_hook(action: str, params: dict) -> MCPContext:
|
||||
"""Called before LLM action to gather context."""
|
||||
```
|
||||
|
||||
- [ ] **Document `post-tool` hook interface**
|
||||
```python
|
||||
def post_tool_hook(action: str, result: Any) -> None:
|
||||
"""Called after LSP action for proactive caching."""
|
||||
```
|
||||
|
||||
### 4.5 Deliverables
|
||||
- [ ] MCP schema JSON documentation
|
||||
- [ ] Unit test: context building
|
||||
- [ ] Integration test: hook → MCP → JSON output
|
||||
|
||||
---
|
||||
|
||||
## Phase 5: Advanced Features (Priority: LOW)
|
||||
|
||||
### 5.1 Custom LSP Commands
|
||||
- [ ] **`codexlens/hybridSearch`**
|
||||
- Expose: `HybridSearchEngine.search()`
|
||||
- Reference: `search/hybrid_search.py`
|
||||
|
||||
- [ ] **`codexlens/symbolGraph`**
|
||||
- Return: Symbol relationship graph
|
||||
- Source: `code_relationships` table
|
||||
|
||||
### 5.2 Proactive Context Caching
|
||||
- [ ] **Implement `post-tool` hook caching**
|
||||
- After `go-to-definition`: pre-fetch references
|
||||
- Cache TTL: 5 minutes
|
||||
- Storage: In-memory LRU
|
||||
|
||||
### 5.3 Performance Optimizations
|
||||
- [ ] **Connection pooling**
|
||||
- Reference: `storage/sqlite_store.py` thread-local
|
||||
|
||||
- [ ] **Result caching**
|
||||
- LRU cache for frequent queries
|
||||
- Invalidate on file change
|
||||
|
||||
---
|
||||
|
||||
## File Structure After Implementation
|
||||
|
||||
```
|
||||
src/codexlens/
|
||||
├── lsp/ # NEW
|
||||
│ ├── __init__.py
|
||||
│ ├── server.py # Main LSP server
|
||||
│ ├── handlers.py # LSP request handlers
|
||||
│ ├── hover_provider.py # Hover information
|
||||
│ └── utils.py # LSP utilities
|
||||
│
|
||||
├── mcp/ # NEW
|
||||
│ ├── __init__.py
|
||||
│ ├── schema.py # MCP data models
|
||||
│ ├── provider.py # Context builder
|
||||
│ └── hooks.py # Hook interfaces
|
||||
│
|
||||
├── search/
|
||||
│ ├── chain_search.py # MODIFY: add search_references()
|
||||
│ └── ...
|
||||
│
|
||||
└── ...
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Dependencies to Add
|
||||
|
||||
```toml
|
||||
# pyproject.toml
|
||||
[project.optional-dependencies]
|
||||
lsp = [
|
||||
"pygls>=1.3.0",
|
||||
]
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Testing Strategy
|
||||
|
||||
### Unit Tests
|
||||
```
|
||||
tests/
|
||||
├── lsp/
|
||||
│ ├── test_definition.py
|
||||
│ ├── test_completion.py
|
||||
│ ├── test_references.py
|
||||
│ └── test_hover.py
|
||||
│
|
||||
└── mcp/
|
||||
├── test_schema.py
|
||||
└── test_provider.py
|
||||
```
|
||||
|
||||
### Integration Tests
|
||||
- [ ] Full LSP handshake test
|
||||
- [ ] Multi-file project navigation
|
||||
- [ ] Incremental index update via didSave
|
||||
|
||||
### Performance Benchmarks
|
||||
| Operation | Target | Acceptable |
|
||||
|-----------|--------|------------|
|
||||
| Definition lookup | < 30ms | < 50ms |
|
||||
| Completion (100 items) | < 50ms | < 100ms |
|
||||
| Find references (10 files) | < 150ms | < 200ms |
|
||||
| Initial indexing (1000 files) | < 60s | < 120s |
|
||||
|
||||
---
|
||||
|
||||
## Execution Order
|
||||
|
||||
```
|
||||
Week 1: Phase 1.1 → 1.2 → 1.3 → 1.4
|
||||
Week 2: Phase 2.1 → 2.2 → 2.3 → 2.4
|
||||
Week 3: Phase 3 + Phase 4.1 → 4.2
|
||||
Week 4: Phase 4.3 → 4.4 → 4.5
|
||||
Week 5: Phase 5 (optional) + Polish
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Quick Start Commands
|
||||
|
||||
```bash
|
||||
# Install LSP dependencies
|
||||
pip install pygls
|
||||
|
||||
# Run LSP server (after implementation)
|
||||
python -m codexlens.lsp --stdio
|
||||
|
||||
# Test LSP connection
|
||||
echo '{"jsonrpc":"2.0","id":1,"method":"initialize","params":{}}' | python -m codexlens.lsp --stdio
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Reference Links
|
||||
|
||||
- pygls Documentation: https://pygls.readthedocs.io/
|
||||
- LSP Specification: https://microsoft.github.io/language-server-protocol/
|
||||
- codex-lens GlobalSymbolIndex: `storage/global_index.py:173`
|
||||
- codex-lens ChainSearchEngine: `search/chain_search.py:618`
|
||||
- codex-lens WatcherManager: `watcher/manager.py:53`
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,284 +0,0 @@
|
||||
# CodexLens MCP Endpoint Design
|
||||
|
||||
> Generated by Gemini Analysis | 2026-01-19
|
||||
> Document Version: 1.0
|
||||
|
||||
## Overview
|
||||
|
||||
This document provides the complete MCP endpoint design for exposing codex-lens LSP capabilities through the Model Context Protocol.
|
||||
|
||||
## Related Files
|
||||
- `src/codexlens/lsp/server.py` - Main LSP server initialization, component management, and capability declaration.
|
||||
- `src/codexlens/lsp/handlers.py` - Implementation of handlers for core LSP requests (definition, references, completion, hover, workspace symbols).
|
||||
- `src/codexlens/lsp/providers.py` - Helper classes, specifically `HoverProvider` for generating rich hover information.
|
||||
- `src/codexlens/storage/global_index.py` - The backing data store (`GlobalSymbolIndex`) that powers most of the symbol lookups.
|
||||
- `src/codexlens/search/__init__.py` - Exposes the `ChainSearchEngine`, used for advanced reference searching.
|
||||
|
||||
## Summary
|
||||
|
||||
The `codex-lens` LSP implementation exposes five core code navigation and search features: go to definition, find references, code completion, hover information, and workspace symbol search. These features are primarily powered by two components: `GlobalSymbolIndex` for fast, project-wide symbol lookups (used by definition, completion, hover, and workspace symbols) and `ChainSearchEngine` for advanced, relationship-aware reference finding.
|
||||
|
||||
The following MCP tool design externalizes these backend capabilities, allowing a client to leverage the same code intelligence features outside of an LSP context.
|
||||
|
||||
## MCP Tool Group: `code.symbol`
|
||||
|
||||
This group provides tools for searching and retrieving information about code symbols (functions, classes, etc.) within an indexed project.
|
||||
|
||||
---
|
||||
|
||||
### 1. `code.symbol.search`
|
||||
|
||||
**Description**: Searches for symbols across the entire indexed project, supporting prefix or contains matching. Ideal for implementing workspace symbol searches or providing code completion suggestions.
|
||||
|
||||
**Mapped LSP Features**: `workspace/symbol`, `textDocument/completion`
|
||||
|
||||
**Backend Implementation**: This tool directly maps to the `GlobalSymbolIndex.search` method.
|
||||
- Reference: `src/codexlens/lsp/handlers.py:302` (in `lsp_workspace_symbol`)
|
||||
- Reference: `src/codexlens/lsp/handlers.py:256` (in `lsp_completion`)
|
||||
|
||||
**Schema**:
|
||||
```json
|
||||
{
|
||||
"name": "code.symbol.search",
|
||||
"description": "Searches for symbols across the entire indexed project, supporting prefix or contains matching. Ideal for implementing workspace symbol searches or providing code completion suggestions.",
|
||||
"inputSchema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"query": {
|
||||
"type": "string",
|
||||
"description": "The symbol name or prefix to search for."
|
||||
},
|
||||
"kind": {
|
||||
"type": "string",
|
||||
"description": "Optional: Filter results to only include symbols of a specific kind (e.g., 'function', 'class', 'method').",
|
||||
"nullable": true
|
||||
},
|
||||
"prefix_mode": {
|
||||
"type": "boolean",
|
||||
"description": "If true, treats the query as a prefix (name LIKE 'query%'). If false, performs a contains search (name LIKE '%query%'). Defaults to true.",
|
||||
"default": true
|
||||
},
|
||||
"limit": {
|
||||
"type": "integer",
|
||||
"description": "The maximum number of symbols to return.",
|
||||
"default": 50
|
||||
}
|
||||
},
|
||||
"required": ["query"]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Returns**:
|
||||
```typescript
|
||||
Array<{
|
||||
name: string; // The name of the symbol
|
||||
kind: string; // The kind of the symbol (e.g., 'function', 'class')
|
||||
file_path: string; // The absolute path to the file containing the symbol
|
||||
range: {
|
||||
start_line: number; // The 1-based starting line number
|
||||
end_line: number; // The 1-based ending line number
|
||||
}
|
||||
}>
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 2. `code.symbol.findDefinition`
|
||||
|
||||
**Description**: Finds the definition location(s) for a symbol with an exact name match. This corresponds to a 'Go to Definition' feature.
|
||||
|
||||
**Mapped LSP Feature**: `textDocument/definition`
|
||||
|
||||
**Backend Implementation**: This tool uses `GlobalSymbolIndex.search` with `prefix_mode=False` and then filters for an exact name match.
|
||||
- Reference: `src/codexlens/lsp/handlers.py:180` (in `lsp_definition`)
|
||||
|
||||
**Schema**:
|
||||
```json
|
||||
{
|
||||
"name": "code.symbol.findDefinition",
|
||||
"description": "Finds the definition location(s) for a symbol with an exact name match. This corresponds to a 'Go to Definition' feature.",
|
||||
"inputSchema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"symbol_name": {
|
||||
"type": "string",
|
||||
"description": "The exact name of the symbol to find."
|
||||
},
|
||||
"kind": {
|
||||
"type": "string",
|
||||
"description": "Optional: Disambiguate by providing the symbol kind (e.g., 'function', 'class').",
|
||||
"nullable": true
|
||||
}
|
||||
},
|
||||
"required": ["symbol_name"]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Returns**:
|
||||
```typescript
|
||||
Array<{
|
||||
name: string; // The name of the symbol
|
||||
kind: string; // The kind of the symbol
|
||||
file_path: string; // The absolute path to the file
|
||||
range: {
|
||||
start_line: number; // The 1-based starting line number
|
||||
end_line: number; // The 1-based ending line number
|
||||
}
|
||||
}>
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 3. `code.symbol.findReferences`
|
||||
|
||||
**Description**: Finds all references to a symbol throughout the project. Uses advanced relationship analysis for accuracy where possible, falling back to name-based search.
|
||||
|
||||
**Mapped LSP Feature**: `textDocument/references`
|
||||
|
||||
**Backend Implementation**: This primarily uses `ChainSearchEngine.search_references` for accuracy, which is more powerful than a simple name search.
|
||||
- Reference: `src/codexlens/lsp/handlers.py:218` (in `lsp_references`)
|
||||
|
||||
**Schema**:
|
||||
```json
|
||||
{
|
||||
"name": "code.symbol.findReferences",
|
||||
"description": "Finds all references to a symbol throughout the project. Uses advanced relationship analysis for accuracy where possible.",
|
||||
"inputSchema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"symbol_name": {
|
||||
"type": "string",
|
||||
"description": "The name of the symbol to find references for."
|
||||
},
|
||||
"context_path": {
|
||||
"type": "string",
|
||||
"description": "The source path of the current project or workspace root to provide context for the search."
|
||||
},
|
||||
"limit": {
|
||||
"type": "integer",
|
||||
"description": "The maximum number of references to return.",
|
||||
"default": 200
|
||||
}
|
||||
},
|
||||
"required": ["symbol_name", "context_path"]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Returns**:
|
||||
```typescript
|
||||
Array<{
|
||||
file_path: string; // The absolute path to the file containing the reference
|
||||
line: number; // The 1-based line number of the reference
|
||||
column: number; // The 0-based starting column of the reference
|
||||
}>
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 4. `code.symbol.getHoverInfo`
|
||||
|
||||
**Description**: Retrieves rich information for a symbol, including its signature and location, suitable for displaying in a hover card.
|
||||
|
||||
**Mapped LSP Feature**: `textDocument/hover`
|
||||
|
||||
**Backend Implementation**: This tool encapsulates the logic from `HoverProvider`, which finds a symbol in `GlobalSymbolIndex` and then reads the source file to extract its signature.
|
||||
- Reference: `src/codexlens/lsp/handlers.py:285` (instantiates `HoverProvider`)
|
||||
- Reference: `src/codexlens/lsp/providers.py:53` (in `HoverProvider.get_hover_info`)
|
||||
|
||||
**Schema**:
|
||||
```json
|
||||
{
|
||||
"name": "code.symbol.getHoverInfo",
|
||||
"description": "Retrieves rich information for a symbol, including its signature and location, suitable for displaying in a hover card.",
|
||||
"inputSchema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"symbol_name": {
|
||||
"type": "string",
|
||||
"description": "The exact name of the symbol to get hover information for."
|
||||
}
|
||||
},
|
||||
"required": ["symbol_name"]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Returns**:
|
||||
```typescript
|
||||
{
|
||||
name: string; // The name of the symbol
|
||||
kind: string; // The kind of the symbol
|
||||
signature: string; // The full code signature as extracted from source
|
||||
file_path: string; // The absolute path to the file
|
||||
start_line: number; // The 1-based starting line number
|
||||
} | null // null if symbol not found
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Integration with CCW MCP Manager
|
||||
|
||||
The `codex-lens-tools` MCP server should be added to the recommended MCP servers list in `ccw/src/templates/dashboard-js/components/mcp-manager.js`:
|
||||
|
||||
```javascript
|
||||
{
|
||||
id: 'codex-lens-tools',
|
||||
nameKey: 'mcp.codexLens.name',
|
||||
descKey: 'mcp.codexLens.desc',
|
||||
icon: 'search-code',
|
||||
category: 'code-intelligence',
|
||||
fields: [
|
||||
{
|
||||
key: 'toolSelection',
|
||||
labelKey: 'mcp.codexLens.field.tools',
|
||||
type: 'multi-select',
|
||||
options: [
|
||||
{ value: 'symbol.search', label: 'Symbol Search' },
|
||||
{ value: 'symbol.findDefinition', label: 'Find Definition' },
|
||||
{ value: 'symbol.findReferences', label: 'Find References' },
|
||||
{ value: 'symbol.getHoverInfo', label: 'Hover Information' }
|
||||
],
|
||||
default: ['symbol.search', 'symbol.findDefinition', 'symbol.findReferences'],
|
||||
required: true,
|
||||
descKey: 'mcp.codexLens.field.tools.desc'
|
||||
}
|
||||
],
|
||||
buildConfig: (values) => {
|
||||
const tools = values.toolSelection || [];
|
||||
const env = { CODEXLENS_ENABLED_TOOLS: tools.join(',') };
|
||||
return buildCrossPlatformMcpConfig('npx', ['-y', 'codex-lens-mcp'], { env });
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Tool Naming Convention
|
||||
|
||||
- **Namespace**: `code.*` for code intelligence tools
|
||||
- **Category**: `symbol` for symbol-related operations
|
||||
- **Operation**: Descriptive verb (search, findDefinition, findReferences, getHoverInfo)
|
||||
- **Full Pattern**: `code.symbol.<operation>`
|
||||
|
||||
This naming scheme aligns with MCP conventions and is easily extensible for future categories (e.g., `code.types.*`, `code.imports.*`).
|
||||
|
||||
## Future Enhancements
|
||||
|
||||
1. **Document Symbol Tool** (`code.symbol.getDocumentSymbols`)
|
||||
- Maps LSP `textDocument/documentSymbol`
|
||||
- Returns all symbols in a specific file
|
||||
|
||||
2. **Type Information** (`code.type.*` group)
|
||||
- Type definitions and relationships
|
||||
- Generic resolution
|
||||
|
||||
3. **Relationship Analysis** (`code.relation.*` group)
|
||||
- Call hierarchy
|
||||
- Inheritance chains
|
||||
- Import dependencies
|
||||
|
||||
---
|
||||
|
||||
Generated: 2026-01-19
|
||||
Status: Ready for Implementation
|
||||
@@ -1,220 +0,0 @@
|
||||
# Migration 005: Database Schema Cleanup
|
||||
|
||||
## Overview
|
||||
|
||||
Migration 005 removes four unused and redundant database fields identified through Gemini analysis. This cleanup improves database efficiency, reduces schema complexity, and eliminates potential data consistency issues.
|
||||
|
||||
## Schema Version
|
||||
|
||||
- **Previous Version**: 4
|
||||
- **New Version**: 5
|
||||
|
||||
## Changes Summary
|
||||
|
||||
### 1. Removed `semantic_metadata.keywords` Column
|
||||
|
||||
**Reason**: Deprecated - replaced by normalized `file_keywords` table in migration 001.
|
||||
|
||||
**Impact**:
|
||||
- Keywords are now exclusively read from the normalized `file_keywords` table
|
||||
- Prevents data sync issues between JSON column and normalized tables
|
||||
- No data loss - migration 001 already populated `file_keywords` table
|
||||
|
||||
**Modified Code**:
|
||||
- `get_semantic_metadata()`: Now reads keywords from `file_keywords` JOIN
|
||||
- `list_semantic_metadata()`: Updated to query `file_keywords` for each result
|
||||
- `add_semantic_metadata()`: Stopped writing to `keywords` column (only writes to `file_keywords`)
|
||||
|
||||
### 2. Removed `symbols.token_count` Column
|
||||
|
||||
**Reason**: Unused - always NULL, never populated.
|
||||
|
||||
**Impact**:
|
||||
- No data loss (column was never used)
|
||||
- Reduces symbols table size
|
||||
- Simplifies symbol insertion logic
|
||||
|
||||
**Modified Code**:
|
||||
- `add_file()`: Removed `token_count` from INSERT statements
|
||||
- `update_file_symbols()`: Removed `token_count` from INSERT statements
|
||||
- Schema creation: No longer creates `token_count` column
|
||||
|
||||
### 3. Removed `symbols.symbol_type` Column
|
||||
|
||||
**Reason**: Redundant - duplicates `symbols.kind` field.
|
||||
|
||||
**Impact**:
|
||||
- No data loss (information preserved in `kind` column)
|
||||
- Reduces symbols table size
|
||||
- Eliminates redundant data storage
|
||||
|
||||
**Modified Code**:
|
||||
- `add_file()`: Removed `symbol_type` from INSERT statements
|
||||
- `update_file_symbols()`: Removed `symbol_type` from INSERT statements
|
||||
- Schema creation: No longer creates `symbol_type` column
|
||||
- Removed `idx_symbols_type` index
|
||||
|
||||
### 4. Removed `subdirs.direct_files` Column
|
||||
|
||||
**Reason**: Unused - never displayed or queried in application logic.
|
||||
|
||||
**Impact**:
|
||||
- No data loss (column was never used)
|
||||
- Reduces subdirs table size
|
||||
- Simplifies subdirectory registration
|
||||
|
||||
**Modified Code**:
|
||||
- `register_subdir()`: Parameter kept for backward compatibility but ignored
|
||||
- `update_subdir_stats()`: Parameter kept for backward compatibility but ignored
|
||||
- `get_subdirs()`: No longer retrieves `direct_files`
|
||||
- `get_subdir()`: No longer retrieves `direct_files`
|
||||
- `SubdirLink` dataclass: Removed `direct_files` field
|
||||
|
||||
## Migration Process
|
||||
|
||||
### Automatic Migration (v4 → v5)
|
||||
|
||||
When an existing database (version 4) is opened:
|
||||
|
||||
1. **Transaction begins**
|
||||
2. **Step 1**: Recreate `semantic_metadata` table without `keywords` column
|
||||
- Data copied from old table (excluding `keywords`)
|
||||
- Old table dropped, new table renamed
|
||||
3. **Step 2**: Recreate `symbols` table without `token_count` and `symbol_type`
|
||||
- Data copied from old table (excluding removed columns)
|
||||
- Old table dropped, new table renamed
|
||||
- Indexes recreated (excluding `idx_symbols_type`)
|
||||
4. **Step 3**: Recreate `subdirs` table without `direct_files`
|
||||
- Data copied from old table (excluding `direct_files`)
|
||||
- Old table dropped, new table renamed
|
||||
5. **Transaction committed**
|
||||
6. **VACUUM** runs to reclaim space (non-critical, continues if fails)
|
||||
|
||||
### New Database Creation (v5)
|
||||
|
||||
New databases are created directly with the clean schema (no migration needed).
|
||||
|
||||
## Benefits
|
||||
|
||||
1. **Reduced Database Size**: Removed 4 unused columns across 3 tables
|
||||
2. **Improved Data Consistency**: Single source of truth for keywords (normalized tables)
|
||||
3. **Simpler Code**: Less maintenance burden for unused fields
|
||||
4. **Better Performance**: Smaller table sizes, fewer indexes to maintain
|
||||
5. **Cleaner Schema**: Easier to understand and maintain
|
||||
|
||||
## Backward Compatibility
|
||||
|
||||
### API Compatibility
|
||||
|
||||
All public APIs remain backward compatible:
|
||||
|
||||
- `register_subdir()` and `update_subdir_stats()` still accept `direct_files` parameter (ignored)
|
||||
- `SubdirLink` dataclass no longer has `direct_files` attribute (breaking change for direct dataclass access)
|
||||
|
||||
### Database Compatibility
|
||||
|
||||
- **v4 databases**: Automatically migrated to v5 on first access
|
||||
- **v5 databases**: No migration needed
|
||||
- **Older databases (v0-v3)**: Migrate through chain (v0→v2→v4→v5)
|
||||
|
||||
## Testing
|
||||
|
||||
Comprehensive test suite added: `tests/test_schema_cleanup_migration.py`
|
||||
|
||||
**Test Coverage**:
|
||||
- ✅ Migration from v4 to v5
|
||||
- ✅ New database creation with clean schema
|
||||
- ✅ Semantic metadata keywords read from normalized table
|
||||
- ✅ Symbols insert without deprecated fields
|
||||
- ✅ Subdir operations without `direct_files`
|
||||
|
||||
**Test Results**: All 5 tests passing
|
||||
|
||||
## Verification
|
||||
|
||||
To verify migration success:
|
||||
|
||||
```python
|
||||
from codexlens.storage.dir_index import DirIndexStore
|
||||
|
||||
store = DirIndexStore("path/to/_index.db")
|
||||
store.initialize()
|
||||
|
||||
# Check schema version
|
||||
conn = store._get_connection()
|
||||
version = conn.execute("PRAGMA user_version").fetchone()[0]
|
||||
assert version == 5
|
||||
|
||||
# Check columns removed
|
||||
cursor = conn.execute("PRAGMA table_info(semantic_metadata)")
|
||||
columns = {row[1] for row in cursor.fetchall()}
|
||||
assert "keywords" not in columns
|
||||
|
||||
cursor = conn.execute("PRAGMA table_info(symbols)")
|
||||
columns = {row[1] for row in cursor.fetchall()}
|
||||
assert "token_count" not in columns
|
||||
assert "symbol_type" not in columns
|
||||
|
||||
cursor = conn.execute("PRAGMA table_info(subdirs)")
|
||||
columns = {row[1] for row in cursor.fetchall()}
|
||||
assert "direct_files" not in columns
|
||||
|
||||
store.close()
|
||||
```
|
||||
|
||||
## Performance Impact
|
||||
|
||||
**Expected Improvements**:
|
||||
- Database size reduction: ~10-15% (varies by data)
|
||||
- VACUUM reclaims space immediately after migration
|
||||
- Slightly faster queries (smaller tables, fewer indexes)
|
||||
|
||||
## Rollback
|
||||
|
||||
Migration 005 is **one-way** (no downgrade function). Removed fields contain:
|
||||
- `keywords`: Already migrated to normalized tables (migration 001)
|
||||
- `token_count`: Always NULL (no data)
|
||||
- `symbol_type`: Duplicate of `kind` (no data loss)
|
||||
- `direct_files`: Never used (no data)
|
||||
|
||||
If rollback is needed, restore from backup before running migration.
|
||||
|
||||
## Files Modified
|
||||
|
||||
1. **Migration File**:
|
||||
- `src/codexlens/storage/migrations/migration_005_cleanup_unused_fields.py` (NEW)
|
||||
|
||||
2. **Core Storage**:
|
||||
- `src/codexlens/storage/dir_index.py`:
|
||||
- Updated `SCHEMA_VERSION` to 5
|
||||
- Added migration 005 to `_apply_migrations()`
|
||||
- Updated `get_semantic_metadata()` to read from `file_keywords`
|
||||
- Updated `list_semantic_metadata()` to read from `file_keywords`
|
||||
- Updated `add_semantic_metadata()` to not write `keywords` column
|
||||
- Updated `add_file()` to not write `token_count`/`symbol_type`
|
||||
- Updated `update_file_symbols()` to not write `token_count`/`symbol_type`
|
||||
- Updated `register_subdir()` to not write `direct_files`
|
||||
- Updated `update_subdir_stats()` to not write `direct_files`
|
||||
- Updated `get_subdirs()` to not read `direct_files`
|
||||
- Updated `get_subdir()` to not read `direct_files`
|
||||
- Updated `SubdirLink` dataclass to remove `direct_files`
|
||||
- Updated `_create_schema()` to create v5 schema directly
|
||||
|
||||
3. **Tests**:
|
||||
- `tests/test_schema_cleanup_migration.py` (NEW)
|
||||
|
||||
## Deployment Checklist
|
||||
|
||||
- [x] Migration script created and tested
|
||||
- [x] Schema version updated to 5
|
||||
- [x] All code updated to use new schema
|
||||
- [x] Comprehensive tests added
|
||||
- [x] Existing tests pass
|
||||
- [x] Documentation updated
|
||||
- [x] Backward compatibility verified
|
||||
|
||||
## References
|
||||
|
||||
- Original Analysis: Gemini code review identified unused/redundant fields
|
||||
- Migration Pattern: Follows SQLite best practices (table recreation)
|
||||
- Previous Migrations: 001 (keywords normalization), 004 (dual FTS)
|
||||
@@ -1,973 +0,0 @@
|
||||
# 多层次分词器设计方案
|
||||
|
||||
## 1. 背景与目标
|
||||
|
||||
### 1.1 当前问题
|
||||
|
||||
当前 `chunker.py` 的两种分词策略存在明显缺陷:
|
||||
|
||||
**symbol-based 策略**:
|
||||
- ✅ 优点:保持代码逻辑完整性,每个chunk是完整的函数/类
|
||||
- ❌ 缺点:粒度不均,超大函数可能达到数百行,影响LLM处理和搜索精度
|
||||
|
||||
**sliding-window 策略**:
|
||||
- ✅ 优点:chunk大小均匀,覆盖全面
|
||||
- ❌ 缺点:破坏逻辑结构,可能将完整的循环/条件块切断
|
||||
|
||||
### 1.2 设计目标
|
||||
|
||||
实现多层次分词器,同时满足:
|
||||
1. **语义完整性**:保持代码逻辑边界的完整性
|
||||
2. **粒度可控**:支持从粗粒度(函数级)到细粒度(逻辑块级)的灵活划分
|
||||
3. **层级关系**:保留chunk之间的父子关系,支持上下文检索
|
||||
4. **高效索引**:优化向量化和检索性能
|
||||
|
||||
## 2. 技术架构
|
||||
|
||||
### 2.1 两层分词架构
|
||||
|
||||
```
|
||||
Source Code
|
||||
↓
|
||||
[Layer 1: Symbol-Level Chunking] ← 使用 tree-sitter AST
|
||||
↓
|
||||
MacroChunks (Functions/Classes)
|
||||
↓
|
||||
[Layer 2: Logic-Block Chunking] ← AST深度遍历
|
||||
↓
|
||||
MicroChunks (Loops/Conditionals/Blocks)
|
||||
↓
|
||||
Vector Embedding + Indexing
|
||||
```
|
||||
|
||||
### 2.2 核心组件
|
||||
|
||||
```python
|
||||
# 新增数据结构
|
||||
@dataclass
|
||||
class ChunkMetadata:
|
||||
"""Chunk元数据"""
|
||||
chunk_id: str
|
||||
parent_id: Optional[str] # 父chunk ID
|
||||
level: int # 层级:1=macro, 2=micro
|
||||
chunk_type: str # function/class/loop/conditional/try_except
|
||||
file_path: str
|
||||
start_line: int
|
||||
end_line: int
|
||||
symbol_name: Optional[str]
|
||||
context_summary: Optional[str] # 继承自父chunk的上下文
|
||||
|
||||
@dataclass
|
||||
class HierarchicalChunk:
|
||||
"""层级化的代码块"""
|
||||
metadata: ChunkMetadata
|
||||
content: str
|
||||
embedding: Optional[List[float]] = None
|
||||
children: List['HierarchicalChunk'] = field(default_factory=list)
|
||||
```
|
||||
|
||||
## 3. 详细实现步骤
|
||||
|
||||
### 3.1 第一层:符号级分词(Macro-Chunking)
|
||||
|
||||
**实现思路**:复用现有 `code_extractor.py` 逻辑,增强元数据提取。
|
||||
|
||||
```python
|
||||
class MacroChunker:
|
||||
"""第一层分词器:提取顶层符号"""
|
||||
|
||||
def __init__(self):
|
||||
self.parser = Parser()
|
||||
# 加载语言grammar
|
||||
|
||||
def chunk_by_symbols(
|
||||
self,
|
||||
content: str,
|
||||
file_path: str,
|
||||
language: str
|
||||
) -> List[HierarchicalChunk]:
|
||||
"""提取顶层函数和类定义"""
|
||||
tree = self.parser.parse(bytes(content, 'utf-8'))
|
||||
root_node = tree.root_node
|
||||
|
||||
chunks = []
|
||||
for node in root_node.children:
|
||||
if node.type in ['function_definition', 'class_definition',
|
||||
'method_definition']:
|
||||
chunk = self._create_macro_chunk(node, content, file_path)
|
||||
chunks.append(chunk)
|
||||
|
||||
return chunks
|
||||
|
||||
def _create_macro_chunk(
|
||||
self,
|
||||
node,
|
||||
content: str,
|
||||
file_path: str
|
||||
) -> HierarchicalChunk:
|
||||
"""从AST节点创建macro chunk"""
|
||||
start_line = node.start_point[0] + 1
|
||||
end_line = node.end_point[0] + 1
|
||||
|
||||
# 提取符号名称
|
||||
name_node = node.child_by_field_name('name')
|
||||
symbol_name = content[name_node.start_byte:name_node.end_byte]
|
||||
|
||||
# 提取完整代码(包含docstring和装饰器)
|
||||
chunk_content = self._extract_with_context(node, content)
|
||||
|
||||
metadata = ChunkMetadata(
|
||||
chunk_id=f"{file_path}:{start_line}",
|
||||
parent_id=None,
|
||||
level=1,
|
||||
chunk_type=node.type,
|
||||
file_path=file_path,
|
||||
start_line=start_line,
|
||||
end_line=end_line,
|
||||
symbol_name=symbol_name,
|
||||
)
|
||||
|
||||
return HierarchicalChunk(
|
||||
metadata=metadata,
|
||||
content=chunk_content,
|
||||
)
|
||||
|
||||
def _extract_with_context(self, node, content: str) -> str:
|
||||
"""提取代码,包含装饰器和docstring"""
|
||||
# 向上查找装饰器
|
||||
start_byte = node.start_byte
|
||||
prev_sibling = node.prev_sibling
|
||||
while prev_sibling and prev_sibling.type == 'decorator':
|
||||
start_byte = prev_sibling.start_byte
|
||||
prev_sibling = prev_sibling.prev_sibling
|
||||
|
||||
return content[start_byte:node.end_byte]
|
||||
```
|
||||
|
||||
### 3.2 第二层:逻辑块分词(Micro-Chunking)
|
||||
|
||||
**实现思路**:在每个macro chunk内部,按逻辑结构进一步划分。
|
||||
|
||||
```python
|
||||
class MicroChunker:
|
||||
"""第二层分词器:提取逻辑块"""
|
||||
|
||||
# 需要划分的逻辑块类型
|
||||
LOGIC_BLOCK_TYPES = {
|
||||
'for_statement',
|
||||
'while_statement',
|
||||
'if_statement',
|
||||
'try_statement',
|
||||
'with_statement',
|
||||
}
|
||||
|
||||
def chunk_logic_blocks(
|
||||
self,
|
||||
macro_chunk: HierarchicalChunk,
|
||||
content: str,
|
||||
max_lines: int = 50 # 大于此行数的macro chunk才进行二次划分
|
||||
) -> List[HierarchicalChunk]:
|
||||
"""在macro chunk内部提取逻辑块"""
|
||||
|
||||
# 小函数不需要二次划分
|
||||
total_lines = macro_chunk.metadata.end_line - macro_chunk.metadata.start_line
|
||||
if total_lines <= max_lines:
|
||||
return []
|
||||
|
||||
tree = self.parser.parse(bytes(macro_chunk.content, 'utf-8'))
|
||||
root_node = tree.root_node
|
||||
|
||||
micro_chunks = []
|
||||
self._traverse_logic_blocks(
|
||||
root_node,
|
||||
macro_chunk,
|
||||
content,
|
||||
micro_chunks
|
||||
)
|
||||
|
||||
return micro_chunks
|
||||
|
||||
def _traverse_logic_blocks(
|
||||
self,
|
||||
node,
|
||||
parent_chunk: HierarchicalChunk,
|
||||
content: str,
|
||||
result: List[HierarchicalChunk]
|
||||
):
|
||||
"""递归遍历AST,提取逻辑块"""
|
||||
|
||||
if node.type in self.LOGIC_BLOCK_TYPES:
|
||||
micro_chunk = self._create_micro_chunk(
|
||||
node,
|
||||
parent_chunk,
|
||||
content
|
||||
)
|
||||
result.append(micro_chunk)
|
||||
parent_chunk.children.append(micro_chunk)
|
||||
|
||||
# 继续遍历子节点
|
||||
for child in node.children:
|
||||
self._traverse_logic_blocks(child, parent_chunk, content, result)
|
||||
|
||||
def _create_micro_chunk(
|
||||
self,
|
||||
node,
|
||||
parent_chunk: HierarchicalChunk,
|
||||
content: str
|
||||
) -> HierarchicalChunk:
|
||||
"""创建micro chunk"""
|
||||
|
||||
# 计算相对于文件的行号
|
||||
start_line = parent_chunk.metadata.start_line + node.start_point[0]
|
||||
end_line = parent_chunk.metadata.start_line + node.end_point[0]
|
||||
|
||||
chunk_content = content[node.start_byte:node.end_byte]
|
||||
|
||||
metadata = ChunkMetadata(
|
||||
chunk_id=f"{parent_chunk.metadata.chunk_id}:L{start_line}",
|
||||
parent_id=parent_chunk.metadata.chunk_id,
|
||||
level=2,
|
||||
chunk_type=node.type,
|
||||
file_path=parent_chunk.metadata.file_path,
|
||||
start_line=start_line,
|
||||
end_line=end_line,
|
||||
symbol_name=parent_chunk.metadata.symbol_name, # 继承父符号名
|
||||
context_summary=None, # 后续由LLM填充
|
||||
)
|
||||
|
||||
return HierarchicalChunk(
|
||||
metadata=metadata,
|
||||
content=chunk_content,
|
||||
)
|
||||
```
|
||||
|
||||
### 3.3 统一接口:多层次分词器
|
||||
|
||||
```python
|
||||
class HierarchicalChunker:
|
||||
"""多层次分词器统一接口"""
|
||||
|
||||
def __init__(self, config: ChunkConfig = None):
|
||||
self.config = config or ChunkConfig()
|
||||
self.macro_chunker = MacroChunker()
|
||||
self.micro_chunker = MicroChunker()
|
||||
|
||||
def chunk_file(
|
||||
self,
|
||||
content: str,
|
||||
file_path: str,
|
||||
language: str
|
||||
) -> List[HierarchicalChunk]:
|
||||
"""对文件进行多层次分词"""
|
||||
|
||||
# 第一层:符号级分词
|
||||
macro_chunks = self.macro_chunker.chunk_by_symbols(
|
||||
content, file_path, language
|
||||
)
|
||||
|
||||
# 第二层:逻辑块分词
|
||||
all_chunks = []
|
||||
for macro_chunk in macro_chunks:
|
||||
all_chunks.append(macro_chunk)
|
||||
|
||||
# 对大函数进行二次划分
|
||||
micro_chunks = self.micro_chunker.chunk_logic_blocks(
|
||||
macro_chunk, content
|
||||
)
|
||||
all_chunks.extend(micro_chunks)
|
||||
|
||||
return all_chunks
|
||||
|
||||
def chunk_file_with_fallback(
|
||||
self,
|
||||
content: str,
|
||||
file_path: str,
|
||||
language: str
|
||||
) -> List[HierarchicalChunk]:
|
||||
"""带降级策略的分词"""
|
||||
|
||||
try:
|
||||
return self.chunk_file(content, file_path, language)
|
||||
except Exception as e:
|
||||
logger.warning(f"Hierarchical chunking failed: {e}, falling back to sliding window")
|
||||
# 降级到滑动窗口策略
|
||||
return self._fallback_sliding_window(content, file_path, language)
|
||||
```
|
||||
|
||||
## 4. 数据存储设计
|
||||
|
||||
### 4.1 数据库Schema
|
||||
|
||||
```sql
|
||||
-- chunk表:存储所有层级的chunk
|
||||
CREATE TABLE chunks (
|
||||
chunk_id TEXT PRIMARY KEY,
|
||||
parent_id TEXT, -- 父chunk ID,NULL表示顶层
|
||||
level INTEGER NOT NULL, -- 1=macro, 2=micro
|
||||
chunk_type TEXT NOT NULL, -- function/class/loop/if/try等
|
||||
file_path TEXT NOT NULL,
|
||||
start_line INTEGER NOT NULL,
|
||||
end_line INTEGER NOT NULL,
|
||||
symbol_name TEXT,
|
||||
content TEXT NOT NULL,
|
||||
content_hash TEXT, -- 用于检测内容变化
|
||||
|
||||
-- 语义元数据(由LLM生成)
|
||||
summary TEXT,
|
||||
keywords TEXT, -- JSON数组
|
||||
purpose TEXT,
|
||||
|
||||
-- 向量嵌入
|
||||
embedding BLOB, -- 存储向量
|
||||
|
||||
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||
|
||||
FOREIGN KEY (parent_id) REFERENCES chunks(chunk_id) ON DELETE CASCADE
|
||||
);
|
||||
|
||||
-- 索引优化
|
||||
CREATE INDEX idx_chunks_file_path ON chunks(file_path);
|
||||
CREATE INDEX idx_chunks_parent_id ON chunks(parent_id);
|
||||
CREATE INDEX idx_chunks_level ON chunks(level);
|
||||
CREATE INDEX idx_chunks_symbol_name ON chunks(symbol_name);
|
||||
```
|
||||
|
||||
### 4.2 向量索引
|
||||
|
||||
使用分层索引策略:
|
||||
|
||||
```python
|
||||
class HierarchicalVectorStore:
|
||||
"""层级化向量存储"""
|
||||
|
||||
def __init__(self, db_path: Path):
|
||||
self.db_path = db_path
|
||||
self.conn = sqlite3.connect(db_path)
|
||||
|
||||
def add_chunk(self, chunk: HierarchicalChunk):
|
||||
"""添加chunk及其向量"""
|
||||
|
||||
cursor = self.conn.cursor()
|
||||
cursor.execute("""
|
||||
INSERT INTO chunks (
|
||||
chunk_id, parent_id, level, chunk_type,
|
||||
file_path, start_line, end_line, symbol_name,
|
||||
content, embedding
|
||||
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
""", (
|
||||
chunk.metadata.chunk_id,
|
||||
chunk.metadata.parent_id,
|
||||
chunk.metadata.level,
|
||||
chunk.metadata.chunk_type,
|
||||
chunk.metadata.file_path,
|
||||
chunk.metadata.start_line,
|
||||
chunk.metadata.end_line,
|
||||
chunk.metadata.symbol_name,
|
||||
chunk.content,
|
||||
self._serialize_embedding(chunk.embedding),
|
||||
))
|
||||
|
||||
self.conn.commit()
|
||||
|
||||
def search_hierarchical(
|
||||
self,
|
||||
query_embedding: List[float],
|
||||
top_k: int = 10,
|
||||
level_weights: Dict[int, float] = None
|
||||
) -> List[Tuple[HierarchicalChunk, float]]:
|
||||
"""层级化检索"""
|
||||
|
||||
# 默认权重:macro chunk权重更高
|
||||
if level_weights is None:
|
||||
level_weights = {1: 1.0, 2: 0.8}
|
||||
|
||||
# 检索所有chunk
|
||||
cursor = self.conn.cursor()
|
||||
cursor.execute("SELECT * FROM chunks WHERE embedding IS NOT NULL")
|
||||
|
||||
results = []
|
||||
for row in cursor.fetchall():
|
||||
chunk = self._row_to_chunk(row)
|
||||
similarity = self._cosine_similarity(
|
||||
query_embedding,
|
||||
chunk.embedding
|
||||
)
|
||||
|
||||
# 根据层级应用权重
|
||||
weighted_score = similarity * level_weights.get(chunk.metadata.level, 1.0)
|
||||
results.append((chunk, weighted_score))
|
||||
|
||||
# 按分数排序
|
||||
results.sort(key=lambda x: x[1], reverse=True)
|
||||
return results[:top_k]
|
||||
|
||||
def get_chunk_with_context(
|
||||
self,
|
||||
chunk_id: str
|
||||
) -> Tuple[HierarchicalChunk, Optional[HierarchicalChunk]]:
|
||||
"""获取chunk及其父chunk(提供上下文)"""
|
||||
|
||||
cursor = self.conn.cursor()
|
||||
|
||||
# 获取chunk本身
|
||||
cursor.execute("SELECT * FROM chunks WHERE chunk_id = ?", (chunk_id,))
|
||||
chunk_row = cursor.fetchone()
|
||||
chunk = self._row_to_chunk(chunk_row)
|
||||
|
||||
# 获取父chunk
|
||||
parent = None
|
||||
if chunk.metadata.parent_id:
|
||||
cursor.execute(
|
||||
"SELECT * FROM chunks WHERE chunk_id = ?",
|
||||
(chunk.metadata.parent_id,)
|
||||
)
|
||||
parent_row = cursor.fetchone()
|
||||
if parent_row:
|
||||
parent = self._row_to_chunk(parent_row)
|
||||
|
||||
return chunk, parent
|
||||
```
|
||||
|
||||
## 5. LLM集成策略
|
||||
|
||||
### 5.1 分层生成语义元数据
|
||||
|
||||
```python
|
||||
class HierarchicalLLMEnhancer:
|
||||
"""为层级chunk生成语义元数据"""
|
||||
|
||||
def enhance_hierarchical_chunks(
|
||||
self,
|
||||
chunks: List[HierarchicalChunk]
|
||||
) -> Dict[str, SemanticMetadata]:
|
||||
"""
|
||||
分层处理策略:
|
||||
1. 先处理所有level=1的macro chunks,生成详细摘要
|
||||
2. 再处理level=2的micro chunks,使用父chunk摘要作为上下文
|
||||
"""
|
||||
|
||||
results = {}
|
||||
|
||||
# 第一轮:处理macro chunks
|
||||
macro_chunks = [c for c in chunks if c.metadata.level == 1]
|
||||
macro_metadata = self.llm_enhancer.enhance_files([
|
||||
FileData(
|
||||
path=c.metadata.chunk_id,
|
||||
content=c.content,
|
||||
language=self._detect_language(c.metadata.file_path)
|
||||
)
|
||||
for c in macro_chunks
|
||||
])
|
||||
results.update(macro_metadata)
|
||||
|
||||
# 第二轮:处理micro chunks(带父上下文)
|
||||
micro_chunks = [c for c in chunks if c.metadata.level == 2]
|
||||
for micro_chunk in micro_chunks:
|
||||
parent_id = micro_chunk.metadata.parent_id
|
||||
parent_summary = macro_metadata.get(parent_id, {}).get('summary', '')
|
||||
|
||||
# 构建带上下文的prompt
|
||||
enhanced_prompt = f"""
|
||||
Parent Function: {micro_chunk.metadata.symbol_name}
|
||||
Parent Summary: {parent_summary}
|
||||
|
||||
Code Block ({micro_chunk.metadata.chunk_type}):
|
||||
```
|
||||
{micro_chunk.content}
|
||||
```
|
||||
|
||||
Generate a concise summary (1 sentence) and keywords for this specific code block.
|
||||
"""
|
||||
|
||||
metadata = self._call_llm_with_context(enhanced_prompt)
|
||||
results[micro_chunk.metadata.chunk_id] = metadata
|
||||
|
||||
return results
|
||||
```
|
||||
|
||||
### 5.2 Prompt优化
|
||||
|
||||
针对不同层级使用不同的prompt模板:
|
||||
|
||||
**Macro Chunk Prompt (Level 1)**:
|
||||
```
|
||||
PURPOSE: Generate comprehensive semantic metadata for a complete function/class
|
||||
TASK:
|
||||
- Provide a detailed summary (2-3 sentences) covering what the code does and why
|
||||
- Extract 8-12 relevant keywords including technical terms and domain concepts
|
||||
- Identify the primary purpose/category
|
||||
MODE: analysis
|
||||
|
||||
CODE:
|
||||
```{language}
|
||||
{content}
|
||||
```
|
||||
|
||||
OUTPUT: JSON with summary, keywords, purpose
|
||||
```
|
||||
|
||||
**Micro Chunk Prompt (Level 2)**:
|
||||
```
|
||||
PURPOSE: Summarize a specific logic block within a larger function
|
||||
CONTEXT:
|
||||
- Parent Function: {symbol_name}
|
||||
- Parent Purpose: {parent_summary}
|
||||
|
||||
TASK:
|
||||
- Provide a brief summary (1 sentence) of this specific block's role in the parent function
|
||||
- Extract 3-5 keywords specific to this block's logic
|
||||
MODE: analysis
|
||||
|
||||
CODE BLOCK ({chunk_type}):
|
||||
```{language}
|
||||
{content}
|
||||
```
|
||||
|
||||
OUTPUT: JSON with summary, keywords
|
||||
```
|
||||
|
||||
## 6. 检索增强
|
||||
|
||||
### 6.1 上下文扩展检索
|
||||
|
||||
```python
|
||||
class ContextualSearchEngine:
|
||||
"""支持上下文扩展的检索引擎"""
|
||||
|
||||
def search_with_context(
|
||||
self,
|
||||
query: str,
|
||||
top_k: int = 10,
|
||||
expand_context: bool = True
|
||||
) -> List[SearchResult]:
|
||||
"""
|
||||
检索并自动扩展上下文
|
||||
|
||||
如果匹配到micro chunk,自动返回其父macro chunk作为上下文
|
||||
"""
|
||||
|
||||
# 生成查询向量
|
||||
query_embedding = self.embedder.embed_single(query)
|
||||
|
||||
# 层级化检索
|
||||
raw_results = self.vector_store.search_hierarchical(
|
||||
query_embedding,
|
||||
top_k=top_k
|
||||
)
|
||||
|
||||
# 扩展上下文
|
||||
enriched_results = []
|
||||
for chunk, score in raw_results:
|
||||
result = SearchResult(
|
||||
path=chunk.metadata.file_path,
|
||||
score=score,
|
||||
content=chunk.content,
|
||||
start_line=chunk.metadata.start_line,
|
||||
end_line=chunk.metadata.end_line,
|
||||
symbol_name=chunk.metadata.symbol_name,
|
||||
)
|
||||
|
||||
# 如果是micro chunk,获取父chunk作为上下文
|
||||
if expand_context and chunk.metadata.level == 2:
|
||||
parent_chunk, _ = self.vector_store.get_chunk_with_context(
|
||||
chunk.metadata.chunk_id
|
||||
)
|
||||
if parent_chunk:
|
||||
result.metadata['parent_context'] = {
|
||||
'summary': parent_chunk.metadata.context_summary,
|
||||
'symbol_name': parent_chunk.metadata.symbol_name,
|
||||
'content': parent_chunk.content,
|
||||
}
|
||||
|
||||
enriched_results.append(result)
|
||||
|
||||
return enriched_results
|
||||
```
|
||||
|
||||
## 7. 测试策略
|
||||
|
||||
### 7.1 单元测试
|
||||
|
||||
```python
|
||||
import pytest
|
||||
from codexlens.semantic.hierarchical_chunker import (
|
||||
HierarchicalChunker, MacroChunker, MicroChunker
|
||||
)
|
||||
|
||||
class TestMacroChunker:
|
||||
"""测试第一层分词"""
|
||||
|
||||
def test_extract_functions(self):
|
||||
"""测试提取函数定义"""
|
||||
code = '''
|
||||
def calculate_total(items):
|
||||
"""Calculate total price."""
|
||||
total = 0
|
||||
for item in items:
|
||||
total += item.price
|
||||
return total
|
||||
|
||||
def apply_discount(total, discount):
|
||||
"""Apply discount to total."""
|
||||
return total * (1 - discount)
|
||||
'''
|
||||
chunker = MacroChunker()
|
||||
chunks = chunker.chunk_by_symbols(code, 'test.py', 'python')
|
||||
|
||||
assert len(chunks) == 2
|
||||
assert chunks[0].metadata.symbol_name == 'calculate_total'
|
||||
assert chunks[1].metadata.symbol_name == 'apply_discount'
|
||||
assert chunks[0].metadata.level == 1
|
||||
|
||||
def test_extract_with_decorators(self):
|
||||
"""测试提取带装饰器的函数"""
|
||||
code = '''
|
||||
@app.route('/api/users')
|
||||
@auth_required
|
||||
def get_users():
|
||||
return User.query.all()
|
||||
'''
|
||||
chunker = MacroChunker()
|
||||
chunks = chunker.chunk_by_symbols(code, 'test.py', 'python')
|
||||
|
||||
assert len(chunks) == 1
|
||||
assert '@app.route' in chunks[0].content
|
||||
assert '@auth_required' in chunks[0].content
|
||||
|
||||
class TestMicroChunker:
|
||||
"""测试第二层分词"""
|
||||
|
||||
def test_extract_loop_blocks(self):
|
||||
"""测试提取循环块"""
|
||||
code = '''
|
||||
def process_items(items):
|
||||
results = []
|
||||
for item in items:
|
||||
if item.active:
|
||||
results.append(process(item))
|
||||
return results
|
||||
'''
|
||||
macro_chunker = MacroChunker()
|
||||
macro_chunks = macro_chunker.chunk_by_symbols(code, 'test.py', 'python')
|
||||
|
||||
micro_chunker = MicroChunker()
|
||||
micro_chunks = micro_chunker.chunk_logic_blocks(
|
||||
macro_chunks[0], code
|
||||
)
|
||||
|
||||
# 应该提取出for循环和if条件块
|
||||
assert len(micro_chunks) >= 1
|
||||
assert any(c.metadata.chunk_type == 'for_statement' for c in micro_chunks)
|
||||
|
||||
def test_skip_small_functions(self):
|
||||
"""测试小函数跳过二次划分"""
|
||||
code = '''
|
||||
def small_func(x):
|
||||
return x * 2
|
||||
'''
|
||||
macro_chunker = MacroChunker()
|
||||
macro_chunks = macro_chunker.chunk_by_symbols(code, 'test.py', 'python')
|
||||
|
||||
micro_chunker = MicroChunker()
|
||||
micro_chunks = micro_chunker.chunk_logic_blocks(
|
||||
macro_chunks[0], code, max_lines=10
|
||||
)
|
||||
|
||||
# 小函数不应该被二次划分
|
||||
assert len(micro_chunks) == 0
|
||||
|
||||
class TestHierarchicalChunker:
|
||||
"""测试完整的多层次分词"""
|
||||
|
||||
def test_full_hierarchical_chunking(self):
|
||||
"""测试完整的层级分词流程"""
|
||||
code = '''
|
||||
def complex_function(data):
|
||||
"""A complex function with multiple logic blocks."""
|
||||
|
||||
# Validation
|
||||
if not data:
|
||||
raise ValueError("Data is empty")
|
||||
|
||||
# Processing
|
||||
results = []
|
||||
for item in data:
|
||||
try:
|
||||
processed = process_item(item)
|
||||
results.append(processed)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to process: {e}")
|
||||
continue
|
||||
|
||||
# Aggregation
|
||||
total = sum(r.value for r in results)
|
||||
return total
|
||||
'''
|
||||
chunker = HierarchicalChunker()
|
||||
chunks = chunker.chunk_file(code, 'test.py', 'python')
|
||||
|
||||
# 应该有1个macro chunk和多个micro chunks
|
||||
macro_chunks = [c for c in chunks if c.metadata.level == 1]
|
||||
micro_chunks = [c for c in chunks if c.metadata.level == 2]
|
||||
|
||||
assert len(macro_chunks) == 1
|
||||
assert len(micro_chunks) > 0
|
||||
|
||||
# 验证父子关系
|
||||
for micro in micro_chunks:
|
||||
assert micro.metadata.parent_id == macro_chunks[0].metadata.chunk_id
|
||||
```
|
||||
|
||||
### 7.2 集成测试
|
||||
|
||||
```python
|
||||
class TestHierarchicalIndexing:
|
||||
"""测试完整的索引流程"""
|
||||
|
||||
def test_index_and_search(self):
|
||||
"""测试分层索引和检索"""
|
||||
|
||||
# 1. 分词
|
||||
chunker = HierarchicalChunker()
|
||||
chunks = chunker.chunk_file(sample_code, 'sample.py', 'python')
|
||||
|
||||
# 2. LLM增强
|
||||
enhancer = HierarchicalLLMEnhancer()
|
||||
metadata = enhancer.enhance_hierarchical_chunks(chunks)
|
||||
|
||||
# 3. 向量化
|
||||
embedder = Embedder()
|
||||
for chunk in chunks:
|
||||
text = metadata[chunk.metadata.chunk_id].summary
|
||||
chunk.embedding = embedder.embed_single(text)
|
||||
|
||||
# 4. 存储
|
||||
vector_store = HierarchicalVectorStore(Path('/tmp/test.db'))
|
||||
for chunk in chunks:
|
||||
vector_store.add_chunk(chunk)
|
||||
|
||||
# 5. 检索
|
||||
search_engine = ContextualSearchEngine(vector_store, embedder)
|
||||
results = search_engine.search_with_context(
|
||||
"find loop that processes items",
|
||||
top_k=5
|
||||
)
|
||||
|
||||
# 验证结果
|
||||
assert len(results) > 0
|
||||
assert any(r.metadata.get('parent_context') for r in results)
|
||||
```
|
||||
|
||||
## 8. 性能优化
|
||||
|
||||
### 8.1 批量处理
|
||||
|
||||
```python
|
||||
class BatchHierarchicalProcessor:
|
||||
"""批量处理多个文件的层级分词"""
|
||||
|
||||
def process_files_batch(
|
||||
self,
|
||||
file_paths: List[Path],
|
||||
batch_size: int = 10
|
||||
):
|
||||
"""批量处理,优化LLM调用"""
|
||||
|
||||
all_chunks = []
|
||||
|
||||
# 1. 批量分词
|
||||
for file_path in file_paths:
|
||||
content = file_path.read_text()
|
||||
chunks = self.chunker.chunk_file(
|
||||
content, str(file_path), self._detect_language(file_path)
|
||||
)
|
||||
all_chunks.extend(chunks)
|
||||
|
||||
# 2. 批量LLM增强(减少API调用)
|
||||
macro_chunks = [c for c in all_chunks if c.metadata.level == 1]
|
||||
for i in range(0, len(macro_chunks), batch_size):
|
||||
batch = macro_chunks[i:i+batch_size]
|
||||
self.enhancer.enhance_batch(batch)
|
||||
|
||||
# 3. 批量向量化
|
||||
all_texts = [c.content for c in all_chunks]
|
||||
embeddings = self.embedder.embed_batch(all_texts)
|
||||
for chunk, embedding in zip(all_chunks, embeddings):
|
||||
chunk.embedding = embedding
|
||||
|
||||
# 4. 批量存储
|
||||
self.vector_store.add_chunks_batch(all_chunks)
|
||||
```
|
||||
|
||||
### 8.2 增量更新
|
||||
|
||||
```python
|
||||
class IncrementalIndexer:
|
||||
"""增量索引器:只处理变化的文件"""
|
||||
|
||||
def update_file(self, file_path: Path):
|
||||
"""增量更新单个文件"""
|
||||
|
||||
content = file_path.read_text()
|
||||
content_hash = hashlib.sha256(content.encode()).hexdigest()
|
||||
|
||||
# 检查文件是否变化
|
||||
cursor = self.conn.cursor()
|
||||
cursor.execute("""
|
||||
SELECT content_hash FROM chunks
|
||||
WHERE file_path = ? AND level = 1
|
||||
LIMIT 1
|
||||
""", (str(file_path),))
|
||||
|
||||
row = cursor.fetchone()
|
||||
if row and row[0] == content_hash:
|
||||
logger.info(f"File {file_path} unchanged, skipping")
|
||||
return
|
||||
|
||||
# 删除旧chunk
|
||||
cursor.execute("DELETE FROM chunks WHERE file_path = ?", (str(file_path),))
|
||||
|
||||
# 重新索引
|
||||
chunks = self.chunker.chunk_file(content, str(file_path), 'python')
|
||||
# ... 继续处理
|
||||
```
|
||||
|
||||
## 9. 潜在问题与解决方案
|
||||
|
||||
### 9.1 问题:超大函数的micro chunk过多
|
||||
|
||||
**现象**:某些遗留代码函数超过1000行,可能产生几十个micro chunks。
|
||||
|
||||
**解决方案**:
|
||||
```python
|
||||
class AdaptiveMicroChunker:
|
||||
"""自适应micro分词:根据函数大小调整策略"""
|
||||
|
||||
def chunk_logic_blocks(self, macro_chunk, content):
|
||||
total_lines = macro_chunk.metadata.end_line - macro_chunk.metadata.start_line
|
||||
|
||||
if total_lines > 500:
|
||||
# 超大函数:只提取顶层逻辑块,不递归
|
||||
return self._extract_top_level_blocks(macro_chunk, content)
|
||||
elif total_lines > 100:
|
||||
# 大函数:递归深度限制为2层
|
||||
return self._extract_blocks_with_depth_limit(macro_chunk, content, max_depth=2)
|
||||
else:
|
||||
# 正常函数:完全跳过micro chunking
|
||||
return []
|
||||
```
|
||||
|
||||
### 9.2 问题:tree-sitter解析失败
|
||||
|
||||
**现象**:对于语法错误的代码,tree-sitter解析可能失败。
|
||||
|
||||
**解决方案**:
|
||||
```python
|
||||
def chunk_file_with_fallback(self, content, file_path, language):
|
||||
"""带降级策略的分词"""
|
||||
|
||||
try:
|
||||
# 尝试层级分词
|
||||
return self.chunk_file(content, file_path, language)
|
||||
except TreeSitterError as e:
|
||||
logger.warning(f"Tree-sitter parsing failed: {e}")
|
||||
|
||||
# 降级到基于正则的简单symbol提取
|
||||
return self._fallback_regex_chunking(content, file_path)
|
||||
except Exception as e:
|
||||
logger.error(f"Chunking failed completely: {e}")
|
||||
|
||||
# 最终降级到滑动窗口
|
||||
return self._fallback_sliding_window(content, file_path, language)
|
||||
```
|
||||
|
||||
### 9.3 问题:向量存储空间占用
|
||||
|
||||
**现象**:每个chunk都存储向量,空间占用可能很大。
|
||||
|
||||
**解决方案**:
|
||||
- **选择性向量化**:只对macro chunks和重要的micro chunks生成向量
|
||||
- **向量压缩**:使用PCA或量化技术减少向量维度
|
||||
- **分离存储**:向量存储在专门的向量数据库(如Faiss),SQLite只存元数据
|
||||
|
||||
```python
|
||||
class SelectiveVectorization:
|
||||
"""选择性向量化:减少存储开销"""
|
||||
|
||||
VECTORIZE_CHUNK_TYPES = {
|
||||
'function_definition', # 总是向量化
|
||||
'class_definition', # 总是向量化
|
||||
'for_statement', # 循环块
|
||||
'try_statement', # 异常处理
|
||||
# 'if_statement' 通常不单独向量化,依赖父chunk
|
||||
}
|
||||
|
||||
def should_vectorize(self, chunk: HierarchicalChunk) -> bool:
|
||||
"""判断是否需要为chunk生成向量"""
|
||||
|
||||
# Level 1总是向量化
|
||||
if chunk.metadata.level == 1:
|
||||
return True
|
||||
|
||||
# Level 2根据类型和大小决定
|
||||
if chunk.metadata.chunk_type not in self.VECTORIZE_CHUNK_TYPES:
|
||||
return False
|
||||
|
||||
# 太小的块(<5行)不向量化
|
||||
lines = chunk.metadata.end_line - chunk.metadata.start_line
|
||||
if lines < 5:
|
||||
return False
|
||||
|
||||
return True
|
||||
```
|
||||
|
||||
## 10. 实施路线图
|
||||
|
||||
### Phase 1: 基础架构(2-3周)
|
||||
- [x] 设计数据结构(HierarchicalChunk, ChunkMetadata)
|
||||
- [ ] 实现MacroChunker(复用现有code_extractor)
|
||||
- [ ] 实现基础的MicroChunker
|
||||
- [ ] 数据库schema设计和migration
|
||||
- [ ] 单元测试
|
||||
|
||||
### Phase 2: LLM集成(1-2周)
|
||||
- [ ] 实现HierarchicalLLMEnhancer
|
||||
- [ ] 设计分层prompt模板
|
||||
- [ ] 批量处理优化
|
||||
- [ ] 集成测试
|
||||
|
||||
### Phase 3: 向量化与检索(1-2周)
|
||||
- [ ] 实现HierarchicalVectorStore
|
||||
- [ ] 实现ContextualSearchEngine
|
||||
- [ ] 上下文扩展逻辑
|
||||
- [ ] 检索性能测试
|
||||
|
||||
### Phase 4: 优化与完善(2周)
|
||||
- [ ] 性能优化(批量处理、增量更新)
|
||||
- [ ] 降级策略完善
|
||||
- [ ] 选择性向量化
|
||||
- [ ] 全面测试和文档
|
||||
|
||||
### Phase 5: 生产部署(1周)
|
||||
- [ ] CLI集成
|
||||
- [ ] 配置选项暴露
|
||||
- [ ] 生产环境测试
|
||||
- [ ] 发布
|
||||
|
||||
**总计预估时间**:7-10周
|
||||
|
||||
## 11. 成功指标
|
||||
|
||||
1. **覆盖率**:95%以上的代码能被正确分词
|
||||
2. **准确率**:层级关系准确率>98%
|
||||
3. **检索质量**:相比单层分词,检索相关性提升30%+
|
||||
4. **性能**:单文件分词<100ms,批量处理>100文件/分钟
|
||||
5. **存储效率**:相比全向量化,空间占用减少40%+
|
||||
|
||||
## 12. 参考资料
|
||||
|
||||
- [Tree-sitter Documentation](https://tree-sitter.github.io/)
|
||||
- [AST-based Code Analysis](https://en.wikipedia.org/wiki/Abstract_syntax_tree)
|
||||
- [Hierarchical Text Segmentation](https://arxiv.org/abs/2104.08836)
|
||||
- 现有代码:`src/codexlens/semantic/chunker.py`
|
||||
@@ -1,417 +0,0 @@
|
||||
# Pure Vector Search 使用指南
|
||||
|
||||
## 概述
|
||||
|
||||
CodexLens 现在支持纯向量语义搜索!这是一个重要的新功能,允许您使用自然语言查询代码。
|
||||
|
||||
### 新增搜索模式
|
||||
|
||||
| 模式 | 描述 | 最佳用途 | 需要嵌入 |
|
||||
|------|------|----------|---------|
|
||||
| `exact` | 精确FTS匹配 | 代码标识符搜索 | ✗ |
|
||||
| `fuzzy` | 模糊FTS匹配 | 容错搜索 | ✗ |
|
||||
| `vector` | 向量 + FTS后备 | 语义 + 关键词混合 | ✓ |
|
||||
| **`pure-vector`** | **纯向量搜索** | **纯自然语言查询** | **✓** |
|
||||
| `hybrid` | 全部融合(RRF) | 最佳召回率 | ✓ |
|
||||
|
||||
### 关键变化
|
||||
|
||||
**之前**:
|
||||
```bash
|
||||
# "vector"模式实际上总是包含exact FTS搜索
|
||||
codexlens search "authentication" --mode vector
|
||||
# 即使没有嵌入,也会返回FTS结果
|
||||
```
|
||||
|
||||
**现在**:
|
||||
```bash
|
||||
# "vector"模式仍保持向量+FTS混合(向后兼容)
|
||||
codexlens search "authentication" --mode vector
|
||||
|
||||
# 新的"pure-vector"模式:仅使用向量搜索
|
||||
codexlens search "how to authenticate users" --mode pure-vector
|
||||
# 没有嵌入时返回空列表(明确行为)
|
||||
```
|
||||
|
||||
## 快速开始
|
||||
|
||||
### 步骤1:安装语义搜索依赖
|
||||
|
||||
```bash
|
||||
# 方式1:使用可选依赖
|
||||
pip install codexlens[semantic]
|
||||
|
||||
# 方式2:手动安装
|
||||
pip install fastembed numpy
|
||||
```
|
||||
|
||||
### 步骤2:创建索引(如果还没有)
|
||||
|
||||
```bash
|
||||
# 为项目创建索引
|
||||
codexlens init ~/projects/your-project
|
||||
```
|
||||
|
||||
### 步骤3:生成向量嵌入
|
||||
|
||||
```bash
|
||||
# 为项目生成嵌入(自动查找索引)
|
||||
codexlens embeddings-generate ~/projects/your-project
|
||||
|
||||
# 为特定索引生成嵌入
|
||||
codexlens embeddings-generate ~/.codexlens/indexes/your-project/_index.db
|
||||
|
||||
# 使用特定模型
|
||||
codexlens embeddings-generate ~/projects/your-project --model fast
|
||||
|
||||
# 强制重新生成
|
||||
codexlens embeddings-generate ~/projects/your-project --force
|
||||
|
||||
# 检查嵌入状态
|
||||
codexlens embeddings-status # 检查所有索引
|
||||
codexlens embeddings-status ~/projects/your-project # 检查特定项目
|
||||
```
|
||||
|
||||
**可用模型**:
|
||||
- `fast`: BAAI/bge-small-en-v1.5 (384维, ~80MB) - 快速,轻量级
|
||||
- `code`: jinaai/jina-embeddings-v2-base-code (768维, ~150MB) - **代码优化**(推荐,默认)
|
||||
- `multilingual`: intfloat/multilingual-e5-large (1024维, ~1GB) - 多语言
|
||||
- `balanced`: mixedbread-ai/mxbai-embed-large-v1 (1024维, ~600MB) - 高精度
|
||||
|
||||
### 步骤4:使用纯向量搜索
|
||||
|
||||
```bash
|
||||
# 纯向量搜索(自然语言)
|
||||
codexlens search "how to verify user credentials" --mode pure-vector
|
||||
|
||||
# 向量搜索(带FTS后备)
|
||||
codexlens search "authentication logic" --mode vector
|
||||
|
||||
# 混合搜索(最佳效果)
|
||||
codexlens search "user login" --mode hybrid
|
||||
|
||||
# 精确代码搜索
|
||||
codexlens search "authenticate_user" --mode exact
|
||||
```
|
||||
|
||||
## 使用场景
|
||||
|
||||
### 场景1:查找实现特定功能的代码
|
||||
|
||||
**问题**:"我如何在这个项目中处理用户身份验证?"
|
||||
|
||||
```bash
|
||||
codexlens search "verify user credentials and authenticate" --mode pure-vector
|
||||
```
|
||||
|
||||
**优势**:理解查询意图,找到语义相关的代码,而不仅仅是关键词匹配。
|
||||
|
||||
### 场景2:查找类似的代码模式
|
||||
|
||||
**问题**:"项目中哪些地方使用了密码哈希?"
|
||||
|
||||
```bash
|
||||
codexlens search "password hashing with salt" --mode pure-vector
|
||||
```
|
||||
|
||||
**优势**:找到即使没有包含"hash"或"password"关键词的相关代码。
|
||||
|
||||
### 场景3:探索性搜索
|
||||
|
||||
**问题**:"如何在这个项目中连接数据库?"
|
||||
|
||||
```bash
|
||||
codexlens search "database connection and initialization" --mode pure-vector
|
||||
```
|
||||
|
||||
**优势**:发现相关代码,即使使用了不同的术语(如"DB"、"connection pool"、"session")。
|
||||
|
||||
### 场景4:混合搜索获得最佳效果
|
||||
|
||||
**问题**:既要关键词匹配,又要语义理解
|
||||
|
||||
```bash
|
||||
# 最佳实践:使用hybrid模式
|
||||
codexlens search "authentication" --mode hybrid
|
||||
```
|
||||
|
||||
**优势**:结合FTS的精确性和向量搜索的语义理解。
|
||||
|
||||
## 故障排除
|
||||
|
||||
### 问题1:纯向量搜索返回空结果
|
||||
|
||||
**原因**:未生成向量嵌入
|
||||
|
||||
**解决方案**:
|
||||
```bash
|
||||
# 检查嵌入状态
|
||||
codexlens embeddings-status ~/projects/your-project
|
||||
|
||||
# 生成嵌入
|
||||
codexlens embeddings-generate ~/projects/your-project
|
||||
|
||||
# 或者对特定索引
|
||||
codexlens embeddings-generate ~/.codexlens/indexes/your-project/_index.db
|
||||
```
|
||||
|
||||
### 问题2:ImportError: fastembed not found
|
||||
|
||||
**原因**:未安装语义搜索依赖
|
||||
|
||||
**解决方案**:
|
||||
```bash
|
||||
pip install codexlens[semantic]
|
||||
```
|
||||
|
||||
### 问题3:嵌入生成失败
|
||||
|
||||
**原因**:模型下载失败或磁盘空间不足
|
||||
|
||||
**解决方案**:
|
||||
```bash
|
||||
# 使用更小的模型
|
||||
codexlens embeddings-generate ~/projects/your-project --model fast
|
||||
|
||||
# 检查磁盘空间(模型需要~100MB)
|
||||
df -h ~/.cache/fastembed
|
||||
```
|
||||
|
||||
### 问题4:搜索速度慢
|
||||
|
||||
**原因**:向量搜索比FTS慢(需要计算余弦相似度)
|
||||
|
||||
**优化**:
|
||||
- 使用`--limit`限制结果数量
|
||||
- 考虑使用`vector`模式(带FTS后备)而不是`pure-vector`
|
||||
- 对于精确标识符搜索,使用`exact`模式
|
||||
|
||||
## 性能对比
|
||||
|
||||
基于测试数据(100个文件,~500个代码块):
|
||||
|
||||
| 模式 | 平均延迟 | 召回率 | 精确率 |
|
||||
|------|---------|--------|--------|
|
||||
| exact | 5.6ms | 中 | 高 |
|
||||
| fuzzy | 7.7ms | 高 | 中 |
|
||||
| vector | 7.4ms | 高 | 中 |
|
||||
| **pure-vector** | **7.0ms** | **最高** | **中** |
|
||||
| hybrid | 9.0ms | 最高 | 高 |
|
||||
|
||||
**结论**:
|
||||
- `exact`: 最快,适合代码标识符
|
||||
- `pure-vector`: 与vector类似速度,更明确的语义搜索
|
||||
- `hybrid`: 轻微开销,但召回率和精确率最佳
|
||||
|
||||
## 最佳实践
|
||||
|
||||
### 1. 选择合适的搜索模式
|
||||
|
||||
```bash
|
||||
# 查找函数名/类名/变量名 → exact
|
||||
codexlens search "UserAuthentication" --mode exact
|
||||
|
||||
# 自然语言问题 → pure-vector
|
||||
codexlens search "how to hash passwords securely" --mode pure-vector
|
||||
|
||||
# 不确定用哪个 → hybrid
|
||||
codexlens search "password security" --mode hybrid
|
||||
```
|
||||
|
||||
### 2. 优化查询
|
||||
|
||||
**不好的查询**(对向量搜索):
|
||||
```bash
|
||||
codexlens search "auth" --mode pure-vector # 太模糊
|
||||
```
|
||||
|
||||
**好的查询**:
|
||||
```bash
|
||||
codexlens search "authenticate user with username and password" --mode pure-vector
|
||||
```
|
||||
|
||||
**原则**:
|
||||
- 使用完整句子描述意图
|
||||
- 包含关键动词和名词
|
||||
- 避免过于简短或模糊的查询
|
||||
|
||||
### 3. 定期更新嵌入
|
||||
|
||||
```bash
|
||||
# 当代码更新后,重新生成嵌入
|
||||
codexlens embeddings-generate ~/projects/your-project --force
|
||||
```
|
||||
|
||||
### 4. 监控嵌入存储空间
|
||||
|
||||
```bash
|
||||
# 检查嵌入数据大小
|
||||
du -sh ~/.codexlens/indexes/*/
|
||||
|
||||
# 嵌入通常占用索引大小的2-3倍
|
||||
# 100个文件 → ~500个chunks → ~1.5MB (768维向量)
|
||||
```
|
||||
|
||||
## API 使用示例
|
||||
|
||||
### Python API
|
||||
|
||||
```python
|
||||
from pathlib import Path
|
||||
from codexlens.search.hybrid_search import HybridSearchEngine
|
||||
|
||||
# 初始化引擎
|
||||
engine = HybridSearchEngine()
|
||||
|
||||
# 纯向量搜索
|
||||
results = engine.search(
|
||||
index_path=Path("~/.codexlens/indexes/project/_index.db"),
|
||||
query="how to authenticate users",
|
||||
limit=10,
|
||||
enable_vector=True,
|
||||
pure_vector=True, # 纯向量模式
|
||||
)
|
||||
|
||||
for result in results:
|
||||
print(f"{result.path}: {result.score:.3f}")
|
||||
print(f" {result.excerpt}")
|
||||
|
||||
# 向量搜索(带FTS后备)
|
||||
results = engine.search(
|
||||
index_path=Path("~/.codexlens/indexes/project/_index.db"),
|
||||
query="authentication",
|
||||
limit=10,
|
||||
enable_vector=True,
|
||||
pure_vector=False, # 允许FTS后备
|
||||
)
|
||||
```
|
||||
|
||||
### 链式搜索API
|
||||
|
||||
```python
|
||||
from codexlens.search.chain_search import ChainSearchEngine, SearchOptions
|
||||
from codexlens.storage.registry import RegistryStore
|
||||
from codexlens.storage.path_mapper import PathMapper
|
||||
|
||||
# 初始化
|
||||
registry = RegistryStore()
|
||||
registry.initialize()
|
||||
mapper = PathMapper()
|
||||
engine = ChainSearchEngine(registry, mapper)
|
||||
|
||||
# 配置搜索选项
|
||||
options = SearchOptions(
|
||||
depth=-1, # 无限深度
|
||||
total_limit=20,
|
||||
hybrid_mode=True,
|
||||
enable_vector=True,
|
||||
pure_vector=True, # 纯向量搜索
|
||||
)
|
||||
|
||||
# 执行搜索
|
||||
result = engine.search(
|
||||
query="verify user credentials",
|
||||
source_path=Path("~/projects/my-app"),
|
||||
options=options
|
||||
)
|
||||
|
||||
print(f"Found {len(result.results)} results in {result.stats.time_ms:.1f}ms")
|
||||
```
|
||||
|
||||
## 技术细节
|
||||
|
||||
### 向量存储架构
|
||||
|
||||
```
|
||||
_index.db (SQLite)
|
||||
├── files # 文件索引表
|
||||
├── files_fts # FTS5全文索引
|
||||
├── files_fts_fuzzy # 模糊搜索索引
|
||||
└── semantic_chunks # 向量嵌入表 ✓ 新增
|
||||
├── id
|
||||
├── file_path
|
||||
├── content # 代码块内容
|
||||
├── embedding # 向量嵌入(BLOB, float32)
|
||||
├── metadata # JSON元数据
|
||||
└── created_at
|
||||
```
|
||||
|
||||
### 向量搜索流程
|
||||
|
||||
```
|
||||
1. 查询嵌入化
|
||||
└─ query → Embedder → query_embedding (768维向量)
|
||||
|
||||
2. 相似度计算
|
||||
└─ VectorStore.search_similar()
|
||||
├─ 加载embedding matrix到内存
|
||||
├─ NumPy向量化余弦相似度计算
|
||||
└─ Top-K选择
|
||||
|
||||
3. 结果返回
|
||||
└─ SearchResult对象列表
|
||||
├─ path: 文件路径
|
||||
├─ score: 相似度分数
|
||||
├─ excerpt: 代码片段
|
||||
└─ metadata: 元数据
|
||||
```
|
||||
|
||||
### RRF融合算法
|
||||
|
||||
混合模式使用Reciprocal Rank Fusion (RRF):
|
||||
|
||||
```python
|
||||
# 默认权重
|
||||
weights = {
|
||||
"exact": 0.4, # 40% 精确FTS
|
||||
"fuzzy": 0.3, # 30% 模糊FTS
|
||||
"vector": 0.3, # 30% 向量搜索
|
||||
}
|
||||
|
||||
# RRF公式
|
||||
score(doc) = Σ weight[source] / (k + rank[source])
|
||||
k = 60 # RRF常数
|
||||
```
|
||||
|
||||
## 未来改进
|
||||
|
||||
- [ ] 增量嵌入更新(当前需要完全重新生成)
|
||||
- [ ] 混合分块策略(symbol-based + sliding window)
|
||||
- [ ] FAISS加速(100x+速度提升)
|
||||
- [ ] 向量压缩(减少50%存储空间)
|
||||
- [ ] 查询扩展(同义词、相关术语)
|
||||
- [ ] 多模态搜索(代码 + 文档 + 注释)
|
||||
|
||||
## 相关资源
|
||||
|
||||
- **实现文件**:
|
||||
- `codexlens/search/hybrid_search.py` - 混合搜索引擎
|
||||
- `codexlens/semantic/embedder.py` - 嵌入生成
|
||||
- `codexlens/semantic/vector_store.py` - 向量存储
|
||||
- `codexlens/semantic/chunker.py` - 代码分块
|
||||
|
||||
- **测试文件**:
|
||||
- `tests/test_pure_vector_search.py` - 纯向量搜索测试
|
||||
- `tests/test_search_comparison.py` - 搜索模式对比
|
||||
|
||||
- **文档**:
|
||||
- `SEARCH_COMPARISON_ANALYSIS.md` - 详细技术分析
|
||||
- `SEARCH_ANALYSIS_SUMMARY.md` - 快速总结
|
||||
|
||||
## 反馈和贡献
|
||||
|
||||
如果您发现问题或有改进建议,请提交issue或PR:
|
||||
- GitHub: https://github.com/your-org/codexlens
|
||||
|
||||
## 更新日志
|
||||
|
||||
### v0.5.0 (2025-12-16)
|
||||
- ✨ 新增 `pure-vector` 搜索模式
|
||||
- ✨ 添加向量嵌入生成脚本
|
||||
- 🔧 修复"vector"模式总是包含exact FTS的问题
|
||||
- 📚 更新文档和使用指南
|
||||
- ✅ 添加纯向量搜索测试套件
|
||||
|
||||
---
|
||||
|
||||
**问题?** 查看 [故障排除](#故障排除) 章节或提交issue。
|
||||
@@ -1,825 +0,0 @@
|
||||
# CodexLens Real LSP Server Implementation Plan
|
||||
|
||||
> **Version**: 2.0
|
||||
> **Status**: Ready for Implementation
|
||||
> **Based on**: Existing LSP_INTEGRATION_PLAN.md + Real Language Server Integration
|
||||
> **Goal**: Implement true LSP server functionality (like cclsp), not pre-indexed search
|
||||
|
||||
---
|
||||
|
||||
## Executive Summary
|
||||
|
||||
### Current State vs Target State
|
||||
|
||||
| Aspect | Current (Pre-indexed) | Target (Real LSP) |
|
||||
|--------|----------------------|-------------------|
|
||||
| **Data Source** | Cached database index | Live language servers |
|
||||
| **Freshness** | Stale (depends on re-index) | Real-time (LSP protocol) |
|
||||
| **Accuracy** | Good for indexed content | Perfect (from language server) |
|
||||
| **Latency** | <50ms (database) | ~50-200ms (LSP) |
|
||||
| **Language Support** | Limited to parsed symbols | Full LSP support (all languages) |
|
||||
| **Complexity** | Simple (DB queries) | High (LSP protocol + server mgmt) |
|
||||
|
||||
### Why Real LSP vs Index-Based
|
||||
|
||||
**Problem with current approach**:
|
||||
- 符号搜索与smart_search没有本质区别
|
||||
- 依赖预索引数据,不能实时反映代码变化
|
||||
- 不支持advanced LSP功能(rename, code actions等)
|
||||
|
||||
**Advantages of real LSP**:
|
||||
- ✅ Real-time code intelligence
|
||||
- ✅ Supported by all major IDEs (VSCode, Neovim, Sublime, etc.)
|
||||
- ✅ Standard protocol (Language Server Protocol)
|
||||
- ✅ Advanced features: rename, code actions, formatting
|
||||
- ✅ Language-agnostic (TypeScript, Python, Go, Rust, Java, etc.)
|
||||
|
||||
---
|
||||
|
||||
## Architecture Design
|
||||
|
||||
### System Architecture
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────┐
|
||||
│ Client Layer │
|
||||
│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │
|
||||
│ │ VS Code │ │ Neovim │ │ Sublime │ │
|
||||
│ │ (LSP Client) │ │ (LSP Client) │ │ (LSP Client) │ │
|
||||
│ └──────┬───────┘ └──────┬───────┘ └──────┬───────┘ │
|
||||
│ │ │ │ │
|
||||
└─────────┼─────────────────┼─────────────────┼───────────┘
|
||||
│ LSP Protocol │ │
|
||||
│ (JSON-RPC/stdio)│ │
|
||||
┌─────────▼─────────────────▼─────────────────▼───────────┐
|
||||
│ CodexLens LSP Server Bridge │
|
||||
│ ┌─────────────────────────────────────────────────────┐ │
|
||||
│ │ LSP Protocol Handler (pygls) │ │
|
||||
│ │ • initialize / shutdown │ │
|
||||
│ │ • textDocument/definition │ │
|
||||
│ │ • textDocument/references │ │
|
||||
│ │ • textDocument/hover │ │
|
||||
│ │ • textDocument/completion │ │
|
||||
│ │ • textDocument/formatting │ │
|
||||
│ │ • workspace/symbol │ │
|
||||
│ └────────────────────┬────────────────────────────────┘ │
|
||||
│ │ │
|
||||
│ ┌────────────────────▼────────────────────────────────┐ │
|
||||
│ │ Language Server Multiplexer │ │
|
||||
│ │ • File type routing (ts→tsserver, py→pylsp, etc.) │ │
|
||||
│ │ • Multi-server management │ │
|
||||
│ │ • Request forwarding & response formatting │ │
|
||||
│ └────────────────────┬────────────────────────────────┘ │
|
||||
│ │ │
|
||||
│ ┌────────────────────▼────────────────────────────────┐ │
|
||||
│ │ Language Servers (Spawned) │ │
|
||||
│ │ ┌────────┐ ┌────────┐ ┌────────┐ ┌────────┐ │ │
|
||||
│ │ │tsserver│ │ pylsp │ │ gopls │ │rust- │ │ │
|
||||
│ │ │ │ │ │ │ │ │analyzer│ │ │
|
||||
│ │ └────────┘ └────────┘ └────────┘ └────────┘ │ │
|
||||
│ └─────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ ┌─────────────────────────────────────────────────────┐ │
|
||||
│ │ Codex-Lens Core (Optional - MCP Layer) │ │
|
||||
│ │ • Semantic search │ │
|
||||
│ │ • Custom MCP tools (enrich_prompt, etc.) │ │
|
||||
│ │ • Hook system (pre-tool, post-tool) │ │
|
||||
│ └─────────────────────────────────────────────────────┘ │
|
||||
└─────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
### Key Differences from Index-Based Approach
|
||||
|
||||
1. **Request Flow**
|
||||
- Index: Query → Database → Results
|
||||
- LSP: Request → Route to LS → LS processes live code → Results
|
||||
|
||||
2. **Configuration**
|
||||
- Index: Depends on indexing state
|
||||
- LSP: Depends on installed language servers
|
||||
|
||||
3. **Latency Profile**
|
||||
- Index: Consistent (~50ms)
|
||||
- LSP: Variable (50-500ms depending on LS performance)
|
||||
|
||||
---
|
||||
|
||||
## Implementation Phases
|
||||
|
||||
### Phase 1: LSP Server Bridge (Foundation)
|
||||
|
||||
**Duration**: ~3-5 days
|
||||
**Complexity**: Medium
|
||||
**Dependencies**: pygls library
|
||||
|
||||
#### 1.1 Setup & Dependencies
|
||||
|
||||
**File**: `pyproject.toml`
|
||||
|
||||
```toml
|
||||
[project.optional-dependencies]
|
||||
lsp = [
|
||||
"pygls>=1.3.0",
|
||||
"lsprotocol>=2023.0.0",
|
||||
]
|
||||
|
||||
[project.scripts]
|
||||
codexlens-lsp = "codexlens.lsp.server:main"
|
||||
```
|
||||
|
||||
**Installation**:
|
||||
```bash
|
||||
pip install -e ".[lsp]"
|
||||
```
|
||||
|
||||
#### 1.2 LSP Server Core
|
||||
|
||||
**Files to create**:
|
||||
1. `src/codexlens/lsp/__init__.py` - Package init
|
||||
2. `src/codexlens/lsp/server.py` - Server entry point
|
||||
3. `src/codexlens/lsp/multiplexer.py` - LS routing & management
|
||||
4. `src/codexlens/lsp/handlers.py` - LSP request handlers
|
||||
|
||||
**Key responsibilities**:
|
||||
- Initialize LSP server via pygls
|
||||
- Handle client capabilities negotiation
|
||||
- Route requests to appropriate language servers
|
||||
- Format language server responses to LSP format
|
||||
|
||||
#### 1.3 Acceptance Criteria
|
||||
|
||||
- [ ] Server starts with `codexlens-lsp --stdio`
|
||||
- [ ] Responds to `initialize` request
|
||||
- [ ] Spawns language servers on demand
|
||||
- [ ] Handles `shutdown` cleanly
|
||||
- [ ] No crashes on malformed requests
|
||||
|
||||
---
|
||||
|
||||
### Phase 2: Language Server Multiplexer
|
||||
|
||||
**Duration**: ~5-7 days
|
||||
**Complexity**: High
|
||||
**Dependencies**: Phase 1 complete
|
||||
|
||||
#### 2.1 Multi-Server Management
|
||||
|
||||
**File**: `src/codexlens/lsp/multiplexer.py`
|
||||
|
||||
**Responsibilities**:
|
||||
- Spawn language servers based on file extension
|
||||
- Maintain server process lifecycle
|
||||
- Route requests by document type
|
||||
- Handle server crashes & restarts
|
||||
|
||||
**Supported Language Servers**:
|
||||
|
||||
| Language | Server | Installation |
|
||||
|----------|--------|--------------|
|
||||
| TypeScript | `typescript-language-server` | `npm i -g typescript-language-server` |
|
||||
| Python | `pylsp` | `pip install python-lsp-server` |
|
||||
| Go | `gopls` | `go install golang.org/x/tools/gopls@latest` |
|
||||
| Rust | `rust-analyzer` | `rustup component add rust-analyzer` |
|
||||
| Java | `jdtls` | Download JDTLS |
|
||||
| C/C++ | `clangd` | `apt install clangd` |
|
||||
|
||||
#### 2.2 Configuration
|
||||
|
||||
**File**: `codexlens-lsp.json` (user config)
|
||||
|
||||
```json
|
||||
{
|
||||
"languageServers": {
|
||||
"typescript": {
|
||||
"command": ["typescript-language-server", "--stdio"],
|
||||
"extensions": ["ts", "tsx", "js", "jsx"],
|
||||
"rootDir": "."
|
||||
},
|
||||
"python": {
|
||||
"command": ["pylsp"],
|
||||
"extensions": ["py", "pyi"],
|
||||
"rootDir": ".",
|
||||
"settings": {
|
||||
"pylsp": {
|
||||
"plugins": {
|
||||
"pycodestyle": { "enabled": true },
|
||||
"pylint": { "enabled": false }
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"go": {
|
||||
"command": ["gopls"],
|
||||
"extensions": ["go"],
|
||||
"rootDir": "."
|
||||
},
|
||||
"rust": {
|
||||
"command": ["rust-analyzer"],
|
||||
"extensions": ["rs"],
|
||||
"rootDir": "."
|
||||
}
|
||||
},
|
||||
"debug": false,
|
||||
"logLevel": "info"
|
||||
}
|
||||
```
|
||||
|
||||
#### 2.3 Acceptance Criteria
|
||||
|
||||
- [ ] Routes requests to correct LS based on file type
|
||||
- [ ] Spawns servers on first request
|
||||
- [ ] Reuses existing server instances
|
||||
- [ ] Handles server restarts on crash
|
||||
- [ ] Respects initialization options from config
|
||||
|
||||
---
|
||||
|
||||
### Phase 3: Core LSP Handlers
|
||||
|
||||
**Duration**: ~5-7 days
|
||||
**Complexity**: Medium
|
||||
**Dependencies**: Phase 1-2 complete
|
||||
|
||||
#### 3.1 Essential Handlers
|
||||
|
||||
Implement LSP request handlers for core functionality:
|
||||
|
||||
**Handler Mapping**:
|
||||
|
||||
```python
|
||||
Handlers = {
|
||||
# Navigation
|
||||
"textDocument/definition": handle_definition,
|
||||
"textDocument/references": handle_references,
|
||||
"textDocument/declaration": handle_declaration,
|
||||
|
||||
# Hover & Info
|
||||
"textDocument/hover": handle_hover,
|
||||
"textDocument/signatureHelp": handle_signature_help,
|
||||
|
||||
# Completion
|
||||
"textDocument/completion": handle_completion,
|
||||
"completionItem/resolve": handle_completion_resolve,
|
||||
|
||||
# Symbols
|
||||
"textDocument/documentSymbol": handle_document_symbols,
|
||||
"workspace/symbol": handle_workspace_symbols,
|
||||
|
||||
# Editing
|
||||
"textDocument/formatting": handle_formatting,
|
||||
"textDocument/rangeFormatting": handle_range_formatting,
|
||||
"textDocument/rename": handle_rename,
|
||||
|
||||
# Diagnostics
|
||||
"textDocument/publishDiagnostics": handle_publish_diagnostics,
|
||||
|
||||
# Misc
|
||||
"textDocument/codeAction": handle_code_action,
|
||||
"textDocument/codeLens": handle_code_lens,
|
||||
}
|
||||
```
|
||||
|
||||
#### 3.2 Request Forwarding Logic
|
||||
|
||||
```python
|
||||
def forward_request_to_lsp(handler_name, params):
|
||||
"""Forward request to appropriate language server."""
|
||||
|
||||
# Extract document info
|
||||
document_uri = params.get("textDocument", {}).get("uri")
|
||||
file_ext = extract_extension(document_uri)
|
||||
|
||||
# Get language server
|
||||
ls = multiplexer.get_server(file_ext)
|
||||
if not ls:
|
||||
return {"error": f"No LS for {file_ext}"}
|
||||
|
||||
# Convert position (1-based → 0-based)
|
||||
normalized_params = normalize_positions(params)
|
||||
|
||||
# Forward to LS
|
||||
response = ls.send_request(handler_name, normalized_params)
|
||||
|
||||
# Convert response format
|
||||
return normalize_response(response)
|
||||
```
|
||||
|
||||
#### 3.3 Acceptance Criteria
|
||||
|
||||
- [ ] All handlers implemented and tested
|
||||
- [ ] Proper position coordinate conversion (LSP is 0-based, user-facing is 1-based)
|
||||
- [ ] Error handling for missing language servers
|
||||
- [ ] Response formatting matches LSP spec
|
||||
- [ ] Latency < 500ms for 95th percentile
|
||||
|
||||
---
|
||||
|
||||
### Phase 4: Advanced Features
|
||||
|
||||
**Duration**: ~3-5 days
|
||||
**Complexity**: Medium
|
||||
**Dependencies**: Phase 1-3 complete
|
||||
|
||||
#### 4.1 Position Tolerance (cclsp-like feature)
|
||||
|
||||
Some LSP clients (like Claude Code with fuzzy positions) may send imprecise positions. Implement retry logic:
|
||||
|
||||
```python
|
||||
def find_symbol_with_tolerance(ls, uri, position, max_attempts=5):
|
||||
"""Try multiple position offsets if exact position fails."""
|
||||
|
||||
positions_to_try = [
|
||||
position, # Original
|
||||
(position.line - 1, position.char), # One line up
|
||||
(position.line + 1, position.char), # One line down
|
||||
(position.line, max(0, position.char - 1)), # One char left
|
||||
(position.line, position.char + 1), # One char right
|
||||
]
|
||||
|
||||
for pos in positions_to_try:
|
||||
try:
|
||||
result = ls.send_request("textDocument/definition", {
|
||||
"textDocument": {"uri": uri},
|
||||
"position": pos
|
||||
})
|
||||
if result:
|
||||
return result
|
||||
except:
|
||||
continue
|
||||
|
||||
return None
|
||||
```
|
||||
|
||||
#### 4.2 MCP Integration (Optional)
|
||||
|
||||
Extend with MCP provider for Claude Code hooks:
|
||||
|
||||
```python
|
||||
class MCPBridgeHandler:
|
||||
"""Bridge LSP results into MCP context."""
|
||||
|
||||
def build_mcp_context_from_lsp(self, symbol_name, lsp_results):
|
||||
"""Convert LSP responses to MCP context."""
|
||||
# Implementation
|
||||
pass
|
||||
```
|
||||
|
||||
#### 4.3 Acceptance Criteria
|
||||
|
||||
- [ ] Position tolerance working (≥3 positions tried)
|
||||
- [ ] MCP context generation functional
|
||||
- [ ] Hook system integration complete
|
||||
- [ ] All test coverage > 80%
|
||||
|
||||
---
|
||||
|
||||
### Phase 5: Deployment & Documentation
|
||||
|
||||
**Duration**: ~2-3 days
|
||||
**Complexity**: Low
|
||||
**Dependencies**: Phase 1-4 complete
|
||||
|
||||
#### 5.1 Installation & Setup Guide
|
||||
|
||||
Create comprehensive documentation:
|
||||
- Installation instructions for each supported language
|
||||
- Configuration guide
|
||||
- Troubleshooting
|
||||
- Performance tuning
|
||||
|
||||
#### 5.2 CLI Tools
|
||||
|
||||
```bash
|
||||
# Start LSP server
|
||||
codexlens-lsp --stdio
|
||||
|
||||
# Check configured language servers
|
||||
codexlens-lsp --list-servers
|
||||
|
||||
# Validate configuration
|
||||
codexlens-lsp --validate-config
|
||||
|
||||
# Show logs
|
||||
codexlens-lsp --log-level debug
|
||||
```
|
||||
|
||||
#### 5.3 Acceptance Criteria
|
||||
|
||||
- [ ] Documentation complete with examples
|
||||
- [ ] All CLI commands working
|
||||
- [ ] Integration tested with VS Code, Neovim
|
||||
- [ ] Performance benchmarks documented
|
||||
|
||||
---
|
||||
|
||||
## Module Structure
|
||||
|
||||
```
|
||||
src/codexlens/lsp/
|
||||
├── __init__.py # Package exports
|
||||
├── server.py # LSP server entry point
|
||||
├── multiplexer.py # Language server manager
|
||||
├── handlers.py # LSP request handlers
|
||||
├── position_utils.py # Coordinate conversion utilities
|
||||
├── process_manager.py # Language server process lifecycle
|
||||
├── response_formatter.py # LSP response formatting
|
||||
└── config.py # Configuration loading
|
||||
|
||||
tests/lsp/
|
||||
├── test_multiplexer.py # LS routing tests
|
||||
├── test_handlers.py # Handler tests
|
||||
├── test_position_conversion.py # Coordinate tests
|
||||
├── test_integration.py # Full LSP handshake
|
||||
└── fixtures/
|
||||
├── sample_python.py # Test files
|
||||
└── sample_typescript.ts
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Dependency Graph
|
||||
|
||||
```
|
||||
Phase 5 (Deployment)
|
||||
↑
|
||||
Phase 4 (Advanced Features)
|
||||
↑
|
||||
Phase 3 (Core Handlers)
|
||||
├─ Depends on: Phase 2
|
||||
├─ Depends on: Phase 1
|
||||
└─ Deliverable: Full LSP functionality
|
||||
|
||||
Phase 2 (Multiplexer)
|
||||
├─ Depends on: Phase 1
|
||||
└─ Deliverable: Multi-server routing
|
||||
|
||||
Phase 1 (Server Bridge)
|
||||
└─ Deliverable: Basic LSP server
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Technology Stack
|
||||
|
||||
| Component | Technology | Rationale |
|
||||
|-----------|-----------|-----------|
|
||||
| LSP Implementation | `pygls` | Mature, well-maintained |
|
||||
| Protocol | LSP 3.17+ | Latest stable version |
|
||||
| Process Management | `subprocess` + `psutil` | Standard Python, no external deps |
|
||||
| Configuration | JSON | Simple, widely understood |
|
||||
| Logging | `logging` module | Built-in, standard |
|
||||
| Testing | `pytest` + `pytest-asyncio` | Industry standard |
|
||||
|
||||
---
|
||||
|
||||
## Risk Assessment
|
||||
|
||||
| Risk | Probability | Impact | Mitigation |
|
||||
|------|-------------|--------|------------|
|
||||
| Language server crashes | Medium | High | Auto-restart with exponential backoff |
|
||||
| Configuration errors | Medium | Medium | Validation on startup |
|
||||
| Performance degradation | Low | High | Implement caching + benchmarks |
|
||||
| Position mismatch issues | Medium | Low | Tolerance layer (try multiple positions) |
|
||||
| Memory leaks (long sessions) | Low | Medium | Connection pooling + cleanup timers |
|
||||
|
||||
---
|
||||
|
||||
## Success Metrics
|
||||
|
||||
1. **Functionality**: All 7 core LSP handlers working
|
||||
2. **Performance**: p95 latency < 500ms for typical requests
|
||||
3. **Reliability**: 99.9% uptime in production
|
||||
4. **Coverage**: >80% code coverage
|
||||
5. **Documentation**: Complete with examples
|
||||
6. **Multi-language**: Support for 5+ languages
|
||||
|
||||
---
|
||||
|
||||
## Comparison: This Approach vs Alternatives
|
||||
|
||||
### Option A: Real LSP Server (This Plan) ✅ RECOMMENDED
|
||||
**Pros**:
|
||||
- ✅ True real-time code intelligence
|
||||
- ✅ Supports all LSP clients (VSCode, Neovim, Sublime, Emacs, etc.)
|
||||
- ✅ Advanced features (rename, code actions, formatting)
|
||||
- ✅ Language-agnostic
|
||||
- ✅ Follows industry standard protocol
|
||||
|
||||
**Cons**:
|
||||
- ❌ More complex implementation
|
||||
- ❌ Depends on external language servers
|
||||
- ❌ Higher latency than index-based
|
||||
|
||||
**Effort**: ~20-25 days
|
||||
|
||||
---
|
||||
|
||||
### Option B: Enhanced Index-Based (Current Approach)
|
||||
**Pros**:
|
||||
- ✅ Simple implementation
|
||||
- ✅ Fast (<50ms)
|
||||
- ✅ No external dependencies
|
||||
|
||||
**Cons**:
|
||||
- ❌ Same as smart_search (user's concern)
|
||||
- ❌ Stale data between re-indexes
|
||||
- ❌ Limited to indexed symbols
|
||||
- ❌ No advanced LSP features
|
||||
|
||||
**Effort**: ~5-10 days
|
||||
|
||||
---
|
||||
|
||||
### Option C: Hybrid (LSP + Index)
|
||||
**Pros**:
|
||||
- ✅ Real-time from LSP
|
||||
- ✅ Fallback to index
|
||||
- ✅ Best of both worlds
|
||||
|
||||
**Cons**:
|
||||
- ❌ Highest complexity
|
||||
- ❌ Difficult to debug conflicts
|
||||
- ❌ Higher maintenance burden
|
||||
|
||||
**Effort**: ~30-35 days
|
||||
|
||||
---
|
||||
|
||||
## Next Steps
|
||||
|
||||
1. **Approve Plan**: Confirm this approach matches requirements
|
||||
2. **Setup Dev Environment**: Install language servers
|
||||
3. **Phase 1 Implementation**: Start with server bridge
|
||||
4. **Iterative Testing**: Test each phase with real IDE integration
|
||||
5. **Documentation**: Maintain docs as implementation progresses
|
||||
|
||||
---
|
||||
|
||||
---
|
||||
|
||||
## Appendix A: VSCode Bridge Implementation
|
||||
|
||||
### A.1 Overview
|
||||
|
||||
VSCode Bridge 是另一种集成方式,通过VSCode扩展暴露其内置LSP功能给外部工具(如CCW MCP Server)。
|
||||
|
||||
**Architecture**:
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────┐
|
||||
│ Claude Code / CCW │
|
||||
│ (MCP Client / CLI) │
|
||||
└───────────────────────────┬─────────────────────────────────────┘
|
||||
│
|
||||
│ MCP Tool Call (vscode_lsp)
|
||||
│
|
||||
┌───────────────────────────▼─────────────────────────────────────┐
|
||||
│ CCW MCP Server │
|
||||
│ ┌─────────────────────────────────────────────────────────────┐ │
|
||||
│ │ vscode_lsp Tool │ │
|
||||
│ │ • HTTP client to VSCode Bridge │ │
|
||||
│ │ • Parameter validation (Zod) │ │
|
||||
│ │ • Response formatting │ │
|
||||
│ └────────────────────────┬────────────────────────────────────┘ │
|
||||
└───────────────────────────┼─────────────────────────────────────┘
|
||||
│
|
||||
│ HTTP POST (localhost:3457)
|
||||
│
|
||||
┌───────────────────────────▼─────────────────────────────────────┐
|
||||
│ ccw-vscode-bridge Extension │
|
||||
│ ┌─────────────────────────────────────────────────────────────┐ │
|
||||
│ │ HTTP Server (port 3457) │ │
|
||||
│ │ Endpoints: │ │
|
||||
│ │ • POST /get_definition │ │
|
||||
│ │ • POST /get_references │ │
|
||||
│ │ • POST /get_hover │ │
|
||||
│ │ • POST /get_document_symbols │ │
|
||||
│ └────────────────────────┬────────────────────────────────────┘ │
|
||||
│ │ │
|
||||
│ ┌────────────────────────▼────────────────────────────────────┐ │
|
||||
│ │ VSCode API Calls │ │
|
||||
│ │ vscode.commands.executeCommand(): │ │
|
||||
│ │ • vscode.executeDefinitionProvider │ │
|
||||
│ │ • vscode.executeReferenceProvider │ │
|
||||
│ │ • vscode.executeHoverProvider │ │
|
||||
│ │ • vscode.executeDocumentSymbolProvider │ │
|
||||
│ └─────────────────────────────────────────────────────────────┘ │
|
||||
└─────────────────────────────────────────────────────────────────┘
|
||||
│
|
||||
│ VSCode LSP Integration
|
||||
│
|
||||
┌───────────────────────────▼─────────────────────────────────────┐
|
||||
│ VSCode Language Services │
|
||||
│ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ │
|
||||
│ │TypeScript│ │ Python │ │ Go │ │ Rust │ │
|
||||
│ │ Server │ │ Server │ │ (gopls) │ │Analyzer │ │
|
||||
│ └─────────┘ └─────────┘ └─────────┘ └─────────┘ │
|
||||
└─────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
### A.2 Component Files
|
||||
|
||||
**已创建的文件**:
|
||||
|
||||
1. `ccw-vscode-bridge/package.json` - VSCode扩展配置
|
||||
2. `ccw-vscode-bridge/tsconfig.json` - TypeScript配置
|
||||
3. `ccw-vscode-bridge/src/extension.ts` - 扩展主代码
|
||||
4. `ccw-vscode-bridge/.vscodeignore` - 打包排除文件
|
||||
5. `ccw-vscode-bridge/README.md` - 使用文档
|
||||
|
||||
**待创建的文件**:
|
||||
|
||||
1. `ccw/src/tools/vscode-lsp.ts` - MCP工具实现
|
||||
2. `ccw/src/tools/index.ts` - 注册新工具
|
||||
|
||||
### A.3 VSCode Bridge Extension Implementation
|
||||
|
||||
**File**: `ccw-vscode-bridge/src/extension.ts`
|
||||
|
||||
```typescript
|
||||
// 核心功能:
|
||||
// 1. 启动HTTP服务器监听3457端口
|
||||
// 2. 接收POST请求,解析JSON body
|
||||
// 3. 调用VSCode内置LSP命令
|
||||
// 4. 返回JSON结果
|
||||
|
||||
// HTTP Endpoints:
|
||||
// POST /get_definition → vscode.executeDefinitionProvider
|
||||
// POST /get_references → vscode.executeReferenceProvider
|
||||
// POST /get_hover → vscode.executeHoverProvider
|
||||
// POST /get_document_symbols → vscode.executeDocumentSymbolProvider
|
||||
```
|
||||
|
||||
### A.4 MCP Tool Implementation
|
||||
|
||||
**File**: `ccw/src/tools/vscode-lsp.ts`
|
||||
|
||||
```typescript
|
||||
/**
|
||||
* MCP tool that communicates with VSCode Bridge extension.
|
||||
*
|
||||
* Actions:
|
||||
* - get_definition: Find symbol definition
|
||||
* - get_references: Find all references
|
||||
* - get_hover: Get hover information
|
||||
* - get_document_symbols: List symbols in file
|
||||
*
|
||||
* Required:
|
||||
* - ccw-vscode-bridge extension running in VSCode
|
||||
* - File must be open in VSCode for accurate results
|
||||
*/
|
||||
|
||||
const schema: ToolSchema = {
|
||||
name: 'vscode_lsp',
|
||||
description: `Access live VSCode LSP features...`,
|
||||
inputSchema: {
|
||||
type: 'object',
|
||||
properties: {
|
||||
action: { type: 'string', enum: [...] },
|
||||
file_path: { type: 'string' },
|
||||
line: { type: 'number' },
|
||||
character: { type: 'number' }
|
||||
},
|
||||
required: ['action', 'file_path']
|
||||
}
|
||||
};
|
||||
```
|
||||
|
||||
### A.5 Advantages vs Standalone LSP Server
|
||||
|
||||
| Feature | VSCode Bridge | Standalone LSP Server |
|
||||
|---------|--------------|----------------------|
|
||||
| **Setup Complexity** | Low (VSCode ext) | Medium (multiple LS) |
|
||||
| **Language Support** | Automatic (VSCode) | Manual config |
|
||||
| **Maintenance** | Low | Medium |
|
||||
| **IDE Independence** | VSCode only | Any LSP client |
|
||||
| **Performance** | Good | Good |
|
||||
| **Advanced Features** | Full VSCode support | LSP standard |
|
||||
|
||||
---
|
||||
|
||||
## Appendix B: Complete Integration Architecture
|
||||
|
||||
### B.1 Three Integration Paths
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────┐
|
||||
│ CodexLens Integration Paths │
|
||||
├─────────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ Path 1: VSCode Bridge (HTTP) Path 2: Standalone LSP Server │
|
||||
│ ──────────────────────── ───────────────────────────── │
|
||||
│ │
|
||||
│ ┌─────────────┐ ┌─────────────┐ │
|
||||
│ │ CCW MCP │ │ Any LSP │ │
|
||||
│ │ vscode_lsp │ │ Client │ │
|
||||
│ └──────┬──────┘ └──────┬──────┘ │
|
||||
│ │ HTTP │ LSP/stdio │
|
||||
│ ▼ ▼ │
|
||||
│ ┌─────────────┐ ┌─────────────┐ │
|
||||
│ │ ccw-vscode │ │ codexlens- │ │
|
||||
│ │ -bridge │ │ lsp │ │
|
||||
│ └──────┬──────┘ └──────┬──────┘ │
|
||||
│ │ VSCode API │ Child Process │
|
||||
│ ▼ ▼ │
|
||||
│ ┌─────────────┐ ┌─────────────┐ │
|
||||
│ │ VSCode │ │ pylsp │ │
|
||||
│ │ LS │ │ tsserver │ │
|
||||
│ └─────────────┘ │ gopls │ │
|
||||
│ └─────────────┘ │
|
||||
│ │
|
||||
│ Path 3: Index-Based (Current) │
|
||||
│ ───────────────────────────── │
|
||||
│ │
|
||||
│ ┌─────────────┐ │
|
||||
│ │ CCW MCP │ │
|
||||
│ │codex_lens_lsp│ │
|
||||
│ └──────┬──────┘ │
|
||||
│ │ Python subprocess │
|
||||
│ ▼ │
|
||||
│ ┌─────────────┐ │
|
||||
│ │ CodexLens │ │
|
||||
│ │ Index DB │ │
|
||||
│ └─────────────┘ │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
### B.2 Recommendation Matrix
|
||||
|
||||
| Use Case | Recommended Path | Reason |
|
||||
|----------|-----------------|--------|
|
||||
| Claude Code + VSCode | Path 1: VSCode Bridge | Simplest, full VSCode features |
|
||||
| CLI-only workflows | Path 2: Standalone LSP | No VSCode dependency |
|
||||
| Quick search across indexed code | Path 3: Index-based | Fastest response |
|
||||
| Multi-IDE support | Path 2: Standalone LSP | Standard protocol |
|
||||
| Advanced refactoring | Path 1: VSCode Bridge | Full VSCode capabilities |
|
||||
|
||||
### B.3 Hybrid Mode (Recommended)
|
||||
|
||||
For maximum flexibility, implement all three paths:
|
||||
|
||||
```javascript
|
||||
// Smart routing in CCW
|
||||
function selectLSPPath(request) {
|
||||
// 1. Try VSCode Bridge first (if available)
|
||||
if (await checkVSCodeBridge()) {
|
||||
return "vscode_bridge";
|
||||
}
|
||||
|
||||
// 2. Fall back to Standalone LSP
|
||||
if (await checkStandaloneLSP(request.fileType)) {
|
||||
return "standalone_lsp";
|
||||
}
|
||||
|
||||
// 3. Last resort: Index-based
|
||||
return "index_based";
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Appendix C: Implementation Tasks Summary
|
||||
|
||||
### C.1 VSCode Bridge Tasks
|
||||
|
||||
| Task ID | Description | Priority | Status |
|
||||
|---------|-------------|----------|--------|
|
||||
| VB-1 | Create ccw-vscode-bridge extension structure | High | ✅ Done |
|
||||
| VB-2 | Implement HTTP server in extension.ts | High | ✅ Done |
|
||||
| VB-3 | Create vscode_lsp MCP tool | High | 🔄 Pending |
|
||||
| VB-4 | Register tool in CCW | High | 🔄 Pending |
|
||||
| VB-5 | Test with VSCode | Medium | 🔄 Pending |
|
||||
| VB-6 | Add connection retry logic | Low | 🔄 Pending |
|
||||
|
||||
### C.2 Standalone LSP Server Tasks
|
||||
|
||||
| Task ID | Description | Priority | Status |
|
||||
|---------|-------------|----------|--------|
|
||||
| LSP-1 | Setup pygls project structure | High | 🔄 Pending |
|
||||
| LSP-2 | Implement multiplexer | High | 🔄 Pending |
|
||||
| LSP-3 | Core handlers (definition, references) | High | 🔄 Pending |
|
||||
| LSP-4 | Position tolerance | Medium | 🔄 Pending |
|
||||
| LSP-5 | Tests and documentation | Medium | 🔄 Pending |
|
||||
|
||||
### C.3 Integration Tasks
|
||||
|
||||
| Task ID | Description | Priority | Status |
|
||||
|---------|-------------|----------|--------|
|
||||
| INT-1 | Smart path routing | Medium | 🔄 Pending |
|
||||
| INT-2 | Unified error handling | Medium | 🔄 Pending |
|
||||
| INT-3 | Performance benchmarks | Low | 🔄 Pending |
|
||||
|
||||
---
|
||||
|
||||
## Questions for Clarification
|
||||
|
||||
Before implementation, confirm:
|
||||
|
||||
1. **Implementation Priority**: Start with VSCode Bridge (simpler) or Standalone LSP (more general)?
|
||||
2. **Language Priority**: Which languages are most important? (TypeScript, Python, Go, Rust, etc.)
|
||||
3. **IDE Focus**: Target VS Code first, then others?
|
||||
4. **Fallback Strategy**: Should we keep index-based search as fallback if LSP fails?
|
||||
5. **Caching**: How much should we cache LS responses?
|
||||
6. **Configuration**: Simple JSON config or more sophisticated format?
|
||||
|
||||
@@ -1,192 +0,0 @@
|
||||
# CodexLens 搜索分析 - 执行摘要
|
||||
|
||||
## 🎯 核心发现
|
||||
|
||||
### 问题1:向量搜索为什么返回空结果?
|
||||
|
||||
**根本原因**:向量嵌入数据不存在
|
||||
|
||||
- ✗ `semantic_chunks` 表未创建
|
||||
- ✗ 从未执行向量嵌入生成流程
|
||||
- ✗ 向量索引数据库实际是 SQLite 中的一个表,不是独立文件
|
||||
|
||||
**位置**:向量数据存储在 `~/.codexlens/indexes/项目名/_index.db` 的 `semantic_chunks` 表中
|
||||
|
||||
### 问题2:向量索引数据库在哪里?
|
||||
|
||||
**存储架构**:
|
||||
```
|
||||
~/.codexlens/indexes/
|
||||
└── project-name/
|
||||
└── _index.db ← SQLite数据库
|
||||
├── files ← 文件索引表
|
||||
├── files_fts ← FTS5全文索引
|
||||
├── files_fts_fuzzy ← 模糊搜索索引
|
||||
└── semantic_chunks ← 向量嵌入表(当前不存在!)
|
||||
```
|
||||
|
||||
**不是独立数据库**:向量数据集成在 SQLite 索引文件中,而不是单独的向量数据库。
|
||||
|
||||
### 问题3:当前架构是否发挥了并行效果?
|
||||
|
||||
**✓ 是的!架构非常优秀**
|
||||
|
||||
- **双层并行**:
|
||||
- 第1层:单索引内,exact/fuzzy/vector 三种搜索方法并行
|
||||
- 第2层:跨多个目录索引并行搜索
|
||||
- **性能表现**:混合模式仅增加 1.6x 开销(9ms vs 5.6ms)
|
||||
- **资源利用**:ThreadPoolExecutor 充分利用 I/O 并发
|
||||
|
||||
## ⚡ 快速修复
|
||||
|
||||
### 立即解决向量搜索问题
|
||||
|
||||
**步骤1:安装依赖**
|
||||
```bash
|
||||
pip install codexlens[semantic]
|
||||
# 或
|
||||
pip install fastembed numpy
|
||||
```
|
||||
|
||||
**步骤2:生成向量嵌入**
|
||||
|
||||
创建脚本 `generate_embeddings.py`:
|
||||
```python
|
||||
from pathlib import Path
|
||||
from codexlens.semantic.embedder import Embedder
|
||||
from codexlens.semantic.vector_store import VectorStore
|
||||
from codexlens.semantic.chunker import Chunker, ChunkConfig
|
||||
import sqlite3
|
||||
|
||||
def generate_embeddings(index_db_path: Path):
|
||||
embedder = Embedder(profile="code")
|
||||
vector_store = VectorStore(index_db_path)
|
||||
chunker = Chunker(config=ChunkConfig(max_chunk_size=2000))
|
||||
|
||||
with sqlite3.connect(index_db_path) as conn:
|
||||
conn.row_factory = sqlite3.Row
|
||||
files = conn.execute("SELECT full_path, content FROM files").fetchall()
|
||||
|
||||
for file_row in files:
|
||||
chunks = chunker.chunk_sliding_window(
|
||||
file_row["content"],
|
||||
file_path=file_row["full_path"],
|
||||
language="python"
|
||||
)
|
||||
for chunk in chunks:
|
||||
chunk.embedding = embedder.embed_single(chunk.content)
|
||||
if chunks:
|
||||
vector_store.add_chunks(chunks, file_row["full_path"])
|
||||
```
|
||||
|
||||
**步骤3:执行生成**
|
||||
```bash
|
||||
python generate_embeddings.py ~/.codexlens/indexes/codex-lens/_index.db
|
||||
```
|
||||
|
||||
**步骤4:验证**
|
||||
```bash
|
||||
# 检查数据
|
||||
sqlite3 ~/.codexlens/indexes/codex-lens/_index.db \
|
||||
"SELECT COUNT(*) FROM semantic_chunks"
|
||||
|
||||
# 测试搜索
|
||||
codexlens search "authentication credentials" --mode vector
|
||||
```
|
||||
|
||||
## 🔍 关键洞察
|
||||
|
||||
### 发现:Vector模式不是纯向量搜索
|
||||
|
||||
**当前行为**:
|
||||
```python
|
||||
# hybrid_search.py:73
|
||||
backends = {"exact": True} # ⚠️ exact搜索总是启用!
|
||||
if enable_vector:
|
||||
backends["vector"] = True
|
||||
```
|
||||
|
||||
**影响**:
|
||||
- "vector模式"实际是 **vector + exact 混合模式**
|
||||
- 即使向量搜索返回空,仍有exact FTS结果
|
||||
- 这就是为什么"向量搜索"在无嵌入时也有结果
|
||||
|
||||
**建议修复**:添加 `pure_vector` 参数以支持真正的纯向量搜索
|
||||
|
||||
## 📊 搜索模式对比
|
||||
|
||||
| 模式 | 延迟 | 召回率 | 适用场景 | 需要嵌入 |
|
||||
|------|------|--------|----------|---------|
|
||||
| **exact** | 5.6ms | 中 | 代码标识符 | ✗ |
|
||||
| **fuzzy** | 7.7ms | 高 | 容错搜索 | ✗ |
|
||||
| **vector** | 7.4ms | 最高 | 语义搜索 | ✓ |
|
||||
| **hybrid** | 9.0ms | 最高 | 通用搜索 | ✓ |
|
||||
|
||||
**推荐**:
|
||||
- 代码搜索 → `--mode exact`
|
||||
- 自然语言 → `--mode hybrid`(需先生成嵌入)
|
||||
- 容错搜索 → `--mode fuzzy`
|
||||
|
||||
## 📈 优化路线图
|
||||
|
||||
### P0 - 立即 (本周)
|
||||
- [x] 生成向量嵌入
|
||||
- [ ] 验证向量搜索可用
|
||||
- [ ] 更新使用文档
|
||||
|
||||
### P1 - 短期 (2周)
|
||||
- [ ] 添加 `pure_vector` 模式
|
||||
- [ ] 增量嵌入更新
|
||||
- [ ] 改进错误提示
|
||||
|
||||
### P2 - 中期 (1-2月)
|
||||
- [ ] 混合分块策略
|
||||
- [ ] 查询扩展
|
||||
- [ ] 自适应权重
|
||||
|
||||
### P3 - 长期 (3-6月)
|
||||
- [ ] FAISS加速
|
||||
- [ ] 向量压缩
|
||||
- [ ] 多模态搜索
|
||||
|
||||
## 📚 详细文档
|
||||
|
||||
完整分析报告:`SEARCH_COMPARISON_ANALYSIS.md`
|
||||
|
||||
包含内容:
|
||||
- 详细问题诊断
|
||||
- 架构深度分析
|
||||
- 完整解决方案
|
||||
- 代码示例
|
||||
- 实施检查清单
|
||||
|
||||
## 🎓 学习要点
|
||||
|
||||
1. **向量搜索需要主动生成嵌入**:不会自动创建
|
||||
2. **双层并行架构很优秀**:无需额外优化
|
||||
3. **RRF融合算法工作良好**:多源结果合理融合
|
||||
4. **Vector模式非纯向量**:包含FTS作为后备
|
||||
|
||||
## 💡 下一步行动
|
||||
|
||||
```bash
|
||||
# 1. 安装依赖
|
||||
pip install codexlens[semantic]
|
||||
|
||||
# 2. 创建索引(如果还没有)
|
||||
codexlens init ~/projects/your-project
|
||||
|
||||
# 3. 生成嵌入
|
||||
python generate_embeddings.py ~/.codexlens/indexes/your-project/_index.db
|
||||
|
||||
# 4. 测试搜索
|
||||
codexlens search "your natural language query" --mode hybrid
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
**问题解决**: ✓ 已识别并提供解决方案
|
||||
**架构评估**: ✓ 并行架构优秀,充分发挥效能
|
||||
**优化建议**: ✓ 提供短期、中期、长期优化路线
|
||||
|
||||
**联系**: 详见 `SEARCH_COMPARISON_ANALYSIS.md` 获取完整技术细节
|
||||
@@ -1,711 +0,0 @@
|
||||
# CodexLens 搜索模式对比分析报告
|
||||
|
||||
**生成时间**: 2025-12-16
|
||||
**分析目标**: 对比向量搜索和混合搜索效果,诊断向量搜索返回空结果的原因,评估并行架构效能
|
||||
|
||||
---
|
||||
|
||||
## 执行摘要
|
||||
|
||||
通过深入的代码分析和实验测试,我们发现了向量搜索在当前实现中的几个关键问题,并提供了针对性的优化方案。
|
||||
|
||||
### 核心发现
|
||||
|
||||
1. **向量搜索返回空结果的根本原因**:缺少向量嵌入数据(semantic_chunks表为空)
|
||||
2. **混合搜索架构设计优秀**:使用了双层并行架构,性能表现良好
|
||||
3. **向量搜索模式的语义问题**:"vector模式"实际上总是包含exact搜索,不是纯向量搜索
|
||||
|
||||
---
|
||||
|
||||
## 1. 问题诊断
|
||||
|
||||
### 1.1 向量索引数据库位置
|
||||
|
||||
**存储架构**:
|
||||
- **位置**: 向量数据集成存储在SQLite索引文件中(`_index.db`)
|
||||
- **表名**: `semantic_chunks`
|
||||
- **字段结构**:
|
||||
- `id`: 主键
|
||||
- `file_path`: 文件路径
|
||||
- `content`: 代码块内容
|
||||
- `embedding`: 向量嵌入(BLOB格式,numpy float32数组)
|
||||
- `metadata`: JSON格式元数据
|
||||
- `created_at`: 创建时间
|
||||
|
||||
**默认存储路径**:
|
||||
- 全局索引: `~/.codexlens/indexes/`
|
||||
- 项目索引: `项目目录/.codexlens/`
|
||||
- 每个目录一个 `_index.db` 文件
|
||||
|
||||
**为什么没有看到向量数据库**:
|
||||
向量数据不是独立数据库,而是与FTS索引共存于同一个SQLite文件中的`semantic_chunks`表。如果该表不存在或为空,说明从未生成过向量嵌入。
|
||||
|
||||
### 1.2 向量搜索返回空结果的原因
|
||||
|
||||
**代码分析** (`hybrid_search.py:195-253`):
|
||||
|
||||
```python
|
||||
def _search_vector(self, index_path: Path, query: str, limit: int) -> List[SearchResult]:
|
||||
try:
|
||||
# 检查1: semantic_chunks表是否存在
|
||||
conn = sqlite3.connect(index_path)
|
||||
cursor = conn.execute(
|
||||
"SELECT name FROM sqlite_master WHERE type='table' AND name='semantic_chunks'"
|
||||
)
|
||||
has_semantic_table = cursor.fetchone() is not None
|
||||
conn.close()
|
||||
|
||||
if not has_semantic_table:
|
||||
self.logger.debug("No semantic_chunks table found")
|
||||
return [] # ❌ 返回空列表
|
||||
|
||||
# 检查2: 向量存储是否有数据
|
||||
vector_store = VectorStore(index_path)
|
||||
if vector_store.count_chunks() == 0:
|
||||
self.logger.debug("Vector store is empty")
|
||||
return [] # ❌ 返回空列表
|
||||
|
||||
# 正常向量搜索流程...
|
||||
except Exception as exc:
|
||||
return [] # ❌ 异常也返回空列表
|
||||
```
|
||||
|
||||
**失败路径**:
|
||||
1. `semantic_chunks`表不存在 → 返回空
|
||||
2. 表存在但无数据 → 返回空
|
||||
3. 语义搜索依赖未安装 → 返回空
|
||||
4. 任何异常 → 返回空
|
||||
|
||||
**当前状态诊断**:
|
||||
通过测试验证,当前项目中:
|
||||
- ✗ `semantic_chunks`表不存在
|
||||
- ✗ 未执行向量嵌入生成流程
|
||||
- ✗ 向量索引从未创建
|
||||
|
||||
**解决方案**:需要执行向量嵌入生成流程(见第3节)
|
||||
|
||||
### 1.3 混合搜索 vs 向量搜索的实际行为
|
||||
|
||||
**重要发现**:当前实现中,"vector模式"并非纯向量搜索。
|
||||
|
||||
**代码证据** (`hybrid_search.py:72-77`):
|
||||
|
||||
```python
|
||||
def search(self, ...):
|
||||
# Determine which backends to use
|
||||
backends = {"exact": True} # ⚠️ exact搜索总是启用!
|
||||
if enable_fuzzy:
|
||||
backends["fuzzy"] = True
|
||||
if enable_vector:
|
||||
backends["vector"] = True
|
||||
```
|
||||
|
||||
**影响**:
|
||||
- 即使设置为"vector模式"(`enable_fuzzy=False, enable_vector=True`),exact搜索仍然运行
|
||||
- 当向量搜索返回空时,RRF融合仍会包含exact搜索的结果
|
||||
- 这导致"向量搜索"在没有嵌入数据时仍返回结果(来自exact FTS)
|
||||
|
||||
**测试验证**:
|
||||
```
|
||||
测试场景:有FTS索引但无向量嵌入
|
||||
查询:"authentication"
|
||||
|
||||
预期行为(纯向量模式):
|
||||
- 向量搜索: 0 结果(无嵌入数据)
|
||||
- 最终结果: 0
|
||||
|
||||
实际行为:
|
||||
- 向量搜索: 0 结果
|
||||
- Exact搜索: 3 结果 ✓ (总是运行)
|
||||
- 最终结果: 3(来自exact,经过RRF)
|
||||
```
|
||||
|
||||
**设计建议**:
|
||||
1. **选项A(推荐)**: 添加纯向量模式标志
|
||||
```python
|
||||
backends = {}
|
||||
if enable_vector and not pure_vector_mode:
|
||||
backends["exact"] = True # 向量搜索的后备方案
|
||||
elif not enable_vector:
|
||||
backends["exact"] = True # 非向量模式总是启用exact
|
||||
```
|
||||
|
||||
2. **选项B**: 文档明确说明当前行为
|
||||
- "vector模式"实际是"vector+exact混合模式"
|
||||
- 提供警告信息当向量搜索返回空时
|
||||
|
||||
---
|
||||
|
||||
## 2. 并行架构分析
|
||||
|
||||
### 2.1 双层并行设计
|
||||
|
||||
CodexLens采用了优秀的双层并行架构:
|
||||
|
||||
**第一层:搜索方法级并行** (`HybridSearchEngine`)
|
||||
|
||||
```python
|
||||
def _search_parallel(self, index_path, query, backends, limit):
|
||||
with ThreadPoolExecutor(max_workers=len(backends)) as executor:
|
||||
# 并行提交搜索任务
|
||||
if backends.get("exact"):
|
||||
future = executor.submit(self._search_exact, ...)
|
||||
if backends.get("fuzzy"):
|
||||
future = executor.submit(self._search_fuzzy, ...)
|
||||
if backends.get("vector"):
|
||||
future = executor.submit(self._search_vector, ...)
|
||||
|
||||
# 收集结果
|
||||
for future in as_completed(future_to_source):
|
||||
results = future.result()
|
||||
```
|
||||
|
||||
**特点**:
|
||||
- 在**单个索引**内,exact/fuzzy/vector三种搜索方法并行执行
|
||||
- 使用`ThreadPoolExecutor`实现I/O密集型任务并行
|
||||
- 使用`as_completed`实现结果流式收集
|
||||
- 动态worker数量(与启用的backend数量相同)
|
||||
|
||||
**性能测试结果**:
|
||||
```
|
||||
搜索模式 | 平均延迟 | 相对overhead
|
||||
-----------|----------|-------------
|
||||
Exact only | 5.6ms | 1.0x (基线)
|
||||
Fuzzy only | 7.7ms | 1.4x
|
||||
Vector only| 7.4ms | 1.3x
|
||||
Hybrid (all)| 9.0ms | 1.6x
|
||||
```
|
||||
|
||||
**分析**:
|
||||
- ✓ Hybrid模式开销合理(<2x),证明并行有效
|
||||
- ✓ 单次搜索延迟仍保持在10ms以下(优秀)
|
||||
|
||||
**第二层:索引级并行** (`ChainSearchEngine`)
|
||||
|
||||
```python
|
||||
def _search_parallel(self, index_paths, query, options):
|
||||
executor = self._get_executor(options.max_workers)
|
||||
|
||||
# 为每个索引提交搜索任务
|
||||
future_to_path = {
|
||||
executor.submit(
|
||||
self._search_single_index,
|
||||
idx_path, query, ...
|
||||
): idx_path
|
||||
for idx_path in index_paths
|
||||
}
|
||||
|
||||
# 收集所有索引的结果
|
||||
for future in as_completed(future_to_path):
|
||||
results = future.result()
|
||||
all_results.extend(results)
|
||||
```
|
||||
|
||||
**特点**:
|
||||
- 跨**多个目录索引**并行搜索
|
||||
- 共享线程池(避免线程创建开销)
|
||||
- 可配置worker数量(默认8)
|
||||
- 结果去重和RRF融合
|
||||
|
||||
### 2.2 并行效能评估
|
||||
|
||||
**优势**:
|
||||
1. ✓ **架构清晰**:双层并行职责明确,互不干扰
|
||||
2. ✓ **资源利用**:I/O密集型任务充分利用线程池
|
||||
3. ✓ **扩展性**:易于添加新的搜索后端
|
||||
4. ✓ **容错性**:单个后端失败不影响其他后端
|
||||
|
||||
**当前利用率**:
|
||||
- 单索引搜索:并行度 = min(3, 启用的backend数量)
|
||||
- 多索引搜索:并行度 = min(8, 索引数量)
|
||||
- **充分发挥**:只要有多个索引或多个backend
|
||||
|
||||
**潜在优化点**:
|
||||
1. **CPU密集型任务**:向量相似度计算已使用numpy向量化,无需额外并行
|
||||
2. **缓存优化**:`VectorStore`已实现embedding matrix缓存,性能良好
|
||||
3. **动态worker调度**:当前固定worker数,可根据任务负载动态调整
|
||||
|
||||
---
|
||||
|
||||
## 3. 解决方案与优化建议
|
||||
|
||||
### 3.1 立即修复:生成向量嵌入
|
||||
|
||||
**步骤1:安装语义搜索依赖**
|
||||
|
||||
```bash
|
||||
# 方式A:完整安装
|
||||
pip install codexlens[semantic]
|
||||
|
||||
# 方式B:手动安装依赖
|
||||
pip install fastembed numpy
|
||||
```
|
||||
|
||||
**步骤2:创建向量索引脚本**
|
||||
|
||||
保存为 `scripts/generate_embeddings.py`:
|
||||
|
||||
```python
|
||||
"""Generate vector embeddings for existing indexes."""
|
||||
|
||||
import logging
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
|
||||
from codexlens.semantic.embedder import Embedder
|
||||
from codexlens.semantic.vector_store import VectorStore
|
||||
from codexlens.semantic.chunker import Chunker, ChunkConfig
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def generate_embeddings_for_index(index_db_path: Path):
|
||||
"""Generate embeddings for all files in an index."""
|
||||
logger.info(f"Processing index: {index_db_path}")
|
||||
|
||||
# Initialize components
|
||||
embedder = Embedder(profile="code") # Use code-optimized model
|
||||
vector_store = VectorStore(index_db_path)
|
||||
chunker = Chunker(config=ChunkConfig(max_chunk_size=2000))
|
||||
|
||||
# Read files from index
|
||||
with sqlite3.connect(index_db_path) as conn:
|
||||
conn.row_factory = sqlite3.Row
|
||||
cursor = conn.execute("SELECT full_path, content, language FROM files")
|
||||
files = cursor.fetchall()
|
||||
|
||||
logger.info(f"Found {len(files)} files to process")
|
||||
|
||||
# Process each file
|
||||
total_chunks = 0
|
||||
for file_row in files:
|
||||
file_path = file_row["full_path"]
|
||||
content = file_row["content"]
|
||||
language = file_row["language"] or "python"
|
||||
|
||||
try:
|
||||
# Create chunks
|
||||
chunks = chunker.chunk_sliding_window(
|
||||
content,
|
||||
file_path=file_path,
|
||||
language=language
|
||||
)
|
||||
|
||||
if not chunks:
|
||||
logger.debug(f"No chunks created for {file_path}")
|
||||
continue
|
||||
|
||||
# Generate embeddings
|
||||
for chunk in chunks:
|
||||
embedding = embedder.embed_single(chunk.content)
|
||||
chunk.embedding = embedding
|
||||
|
||||
# Store chunks
|
||||
vector_store.add_chunks(chunks, file_path)
|
||||
total_chunks += len(chunks)
|
||||
logger.info(f"✓ {file_path}: {len(chunks)} chunks")
|
||||
|
||||
except Exception as exc:
|
||||
logger.error(f"✗ {file_path}: {exc}")
|
||||
|
||||
logger.info(f"Completed: {total_chunks} total chunks indexed")
|
||||
return total_chunks
|
||||
|
||||
|
||||
def main():
|
||||
import sys
|
||||
|
||||
if len(sys.argv) < 2:
|
||||
print("Usage: python generate_embeddings.py <index_db_path>")
|
||||
print("Example: python generate_embeddings.py ~/.codexlens/indexes/project/_index.db")
|
||||
sys.exit(1)
|
||||
|
||||
index_path = Path(sys.argv[1])
|
||||
|
||||
if not index_path.exists():
|
||||
print(f"Error: Index not found at {index_path}")
|
||||
sys.exit(1)
|
||||
|
||||
generate_embeddings_for_index(index_path)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
```
|
||||
|
||||
**步骤3:执行生成**
|
||||
|
||||
```bash
|
||||
# 为特定项目生成嵌入
|
||||
python scripts/generate_embeddings.py ~/.codexlens/indexes/codex-lens/_index.db
|
||||
|
||||
# 或使用find批量处理
|
||||
find ~/.codexlens/indexes -name "_index.db" -type f | while read db; do
|
||||
python scripts/generate_embeddings.py "$db"
|
||||
done
|
||||
```
|
||||
|
||||
**步骤4:验证生成结果**
|
||||
|
||||
```bash
|
||||
# 检查semantic_chunks表
|
||||
sqlite3 ~/.codexlens/indexes/codex-lens/_index.db \
|
||||
"SELECT COUNT(*) as chunk_count FROM semantic_chunks"
|
||||
|
||||
# 测试向量搜索
|
||||
codexlens search "authentication user credentials" \
|
||||
--path ~/projects/codex-lens \
|
||||
--mode vector
|
||||
```
|
||||
|
||||
### 3.2 短期优化:改进向量搜索语义
|
||||
|
||||
**问题**:当前"vector模式"实际包含exact搜索,语义不清晰
|
||||
|
||||
**解决方案**:添加`pure_vector`参数
|
||||
|
||||
**实现** (修改 `hybrid_search.py`):
|
||||
|
||||
```python
|
||||
class HybridSearchEngine:
|
||||
def search(
|
||||
self,
|
||||
index_path: Path,
|
||||
query: str,
|
||||
limit: int = 20,
|
||||
enable_fuzzy: bool = True,
|
||||
enable_vector: bool = False,
|
||||
pure_vector: bool = False, # 新增参数
|
||||
) -> List[SearchResult]:
|
||||
"""Execute hybrid search with parallel retrieval and RRF fusion.
|
||||
|
||||
Args:
|
||||
...
|
||||
pure_vector: If True, only use vector search (no FTS fallback)
|
||||
"""
|
||||
# Determine which backends to use
|
||||
backends = {}
|
||||
|
||||
if pure_vector:
|
||||
# 纯向量模式:只使用向量搜索
|
||||
if enable_vector:
|
||||
backends["vector"] = True
|
||||
else:
|
||||
# 混合模式:总是包含exact搜索作为基线
|
||||
backends["exact"] = True
|
||||
if enable_fuzzy:
|
||||
backends["fuzzy"] = True
|
||||
if enable_vector:
|
||||
backends["vector"] = True
|
||||
|
||||
# ... rest of the method
|
||||
```
|
||||
|
||||
**CLI更新** (修改 `commands.py`):
|
||||
|
||||
```python
|
||||
@app.command()
|
||||
def search(
|
||||
...
|
||||
mode: str = typer.Option("exact", "--mode", "-m",
|
||||
help="Search mode: exact, fuzzy, hybrid, vector, pure-vector."),
|
||||
...
|
||||
):
|
||||
"""...
|
||||
Search Modes:
|
||||
- exact: Exact FTS
|
||||
- fuzzy: Fuzzy FTS
|
||||
- hybrid: RRF fusion of exact + fuzzy + vector (recommended)
|
||||
- vector: Vector search with exact FTS fallback
|
||||
- pure-vector: Pure semantic vector search (no FTS fallback)
|
||||
"""
|
||||
...
|
||||
|
||||
# Map mode to options
|
||||
if mode == "exact":
|
||||
hybrid_mode, enable_fuzzy, enable_vector, pure_vector = False, False, False, False
|
||||
elif mode == "fuzzy":
|
||||
hybrid_mode, enable_fuzzy, enable_vector, pure_vector = False, True, False, False
|
||||
elif mode == "vector":
|
||||
hybrid_mode, enable_fuzzy, enable_vector, pure_vector = True, False, True, False
|
||||
elif mode == "pure-vector":
|
||||
hybrid_mode, enable_fuzzy, enable_vector, pure_vector = True, False, True, True
|
||||
elif mode == "hybrid":
|
||||
hybrid_mode, enable_fuzzy, enable_vector, pure_vector = True, True, True, False
|
||||
```
|
||||
|
||||
### 3.3 中期优化:增强向量搜索效果
|
||||
|
||||
**优化1:改进分块策略**
|
||||
|
||||
当前使用简单的滑动窗口,可优化为:
|
||||
|
||||
```python
|
||||
class HybridChunker(Chunker):
|
||||
"""Hybrid chunking strategy combining symbol-based and sliding window."""
|
||||
|
||||
def chunk_hybrid(
|
||||
self,
|
||||
content: str,
|
||||
symbols: List[Symbol],
|
||||
file_path: str,
|
||||
language: str,
|
||||
) -> List[SemanticChunk]:
|
||||
"""
|
||||
1. 优先按symbol分块(函数、类级别)
|
||||
2. 对过大symbol,进一步使用滑动窗口
|
||||
3. 对symbol间隙,使用滑动窗口补充
|
||||
"""
|
||||
chunks = []
|
||||
|
||||
# Step 1: Symbol-based chunks
|
||||
symbol_chunks = self.chunk_by_symbol(content, symbols, file_path, language)
|
||||
|
||||
# Step 2: Split oversized symbols
|
||||
for chunk in symbol_chunks:
|
||||
if chunk.token_count > self.config.max_chunk_size:
|
||||
# 使用滑动窗口进一步分割
|
||||
sub_chunks = self._split_large_chunk(chunk)
|
||||
chunks.extend(sub_chunks)
|
||||
else:
|
||||
chunks.append(chunk)
|
||||
|
||||
# Step 3: Fill gaps with sliding window
|
||||
gap_chunks = self._chunk_gaps(content, symbols, file_path, language)
|
||||
chunks.extend(gap_chunks)
|
||||
|
||||
return chunks
|
||||
```
|
||||
|
||||
**优化2:添加查询扩展**
|
||||
|
||||
```python
|
||||
class QueryExpander:
|
||||
"""Expand queries for better vector search recall."""
|
||||
|
||||
def expand(self, query: str) -> str:
|
||||
"""Expand query with synonyms and related terms."""
|
||||
# 示例:代码领域同义词
|
||||
expansions = {
|
||||
"auth": ["authentication", "authorization", "login"],
|
||||
"db": ["database", "storage", "repository"],
|
||||
"api": ["endpoint", "route", "interface"],
|
||||
}
|
||||
|
||||
terms = query.lower().split()
|
||||
expanded = set(terms)
|
||||
|
||||
for term in terms:
|
||||
if term in expansions:
|
||||
expanded.update(expansions[term])
|
||||
|
||||
return " ".join(expanded)
|
||||
```
|
||||
|
||||
**优化3:混合检索策略**
|
||||
|
||||
```python
|
||||
class AdaptiveHybridSearch:
|
||||
"""Adaptive search strategy based on query type."""
|
||||
|
||||
def search(self, query: str, ...):
|
||||
# 分析查询类型
|
||||
query_type = self._classify_query(query)
|
||||
|
||||
if query_type == "keyword":
|
||||
# 代码标识符查询 → 偏重FTS
|
||||
weights = {"exact": 0.5, "fuzzy": 0.3, "vector": 0.2}
|
||||
elif query_type == "semantic":
|
||||
# 自然语言查询 → 偏重向量
|
||||
weights = {"exact": 0.2, "fuzzy": 0.2, "vector": 0.6}
|
||||
elif query_type == "hybrid":
|
||||
# 混合查询 → 平衡权重
|
||||
weights = {"exact": 0.4, "fuzzy": 0.3, "vector": 0.3}
|
||||
|
||||
return self.engine.search(query, weights=weights, ...)
|
||||
```
|
||||
|
||||
### 3.4 长期优化:性能与质量提升
|
||||
|
||||
**优化1:增量嵌入更新**
|
||||
|
||||
```python
|
||||
class IncrementalEmbeddingUpdater:
|
||||
"""Update embeddings incrementally for changed files."""
|
||||
|
||||
def update_for_file(self, file_path: str, new_content: str):
|
||||
"""Only regenerate embeddings for changed file."""
|
||||
# 1. 删除旧嵌入
|
||||
self.vector_store.delete_file_chunks(file_path)
|
||||
|
||||
# 2. 生成新嵌入
|
||||
chunks = self.chunker.chunk(new_content, ...)
|
||||
for chunk in chunks:
|
||||
chunk.embedding = self.embedder.embed_single(chunk.content)
|
||||
|
||||
# 3. 存储新嵌入
|
||||
self.vector_store.add_chunks(chunks, file_path)
|
||||
```
|
||||
|
||||
**优化2:向量索引压缩**
|
||||
|
||||
```python
|
||||
# 使用量化技术减少存储空间(768维 → 192维)
|
||||
from qdrant_client import models
|
||||
|
||||
# 产品量化(PQ)压缩
|
||||
compressed_vector = pq_quantize(embedding, target_dim=192)
|
||||
```
|
||||
|
||||
**优化3:向量搜索加速**
|
||||
|
||||
```python
|
||||
# 使用FAISS或Hnswlib替代numpy暴力搜索
|
||||
import faiss
|
||||
|
||||
class FAISSVectorStore(VectorStore):
|
||||
def __init__(self, db_path, dim=768):
|
||||
super().__init__(db_path)
|
||||
# 使用HNSW索引
|
||||
self.index = faiss.IndexHNSWFlat(dim, 32)
|
||||
self._load_vectors_to_index()
|
||||
|
||||
def search_similar(self, query_embedding, top_k=10):
|
||||
# FAISS加速搜索(100x+)
|
||||
scores, indices = self.index.search(
|
||||
np.array([query_embedding]), top_k
|
||||
)
|
||||
return self._fetch_by_indices(indices[0], scores[0])
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 4. 对比总结
|
||||
|
||||
### 4.1 搜索模式对比
|
||||
|
||||
| 维度 | Exact FTS | Fuzzy FTS | Vector Search | Hybrid (推荐) |
|
||||
|------|-----------|-----------|---------------|--------------|
|
||||
| **匹配类型** | 精确词匹配 | 容错匹配 | 语义相似 | 多模式融合 |
|
||||
| **查询类型** | 标识符、关键词 | 拼写错误容忍 | 自然语言 | 所有类型 |
|
||||
| **召回率** | 中 | 高 | 最高 | 最高 |
|
||||
| **精确率** | 高 | 中 | 中 | 高 |
|
||||
| **延迟** | 5-7ms | 7-9ms | 7-10ms | 9-11ms |
|
||||
| **依赖** | 仅SQLite | 仅SQLite | fastembed+numpy | 全部 |
|
||||
| **存储开销** | 小(FTS索引) | 小(FTS索引) | 大(向量) | 大(FTS+向量) |
|
||||
| **适用场景** | 代码搜索 | 容错搜索 | 概念搜索 | 通用搜索 |
|
||||
|
||||
### 4.2 推荐使用策略
|
||||
|
||||
**场景1:代码标识符搜索**(函数名、类名、变量名)
|
||||
```bash
|
||||
codexlens search "authenticate_user" --mode exact
|
||||
```
|
||||
→ 使用exact模式,最快且最精确
|
||||
|
||||
**场景2:概念性搜索**("如何验证用户身份")
|
||||
```bash
|
||||
codexlens search "how to verify user credentials" --mode hybrid
|
||||
```
|
||||
→ 使用hybrid模式,结合语义和关键词
|
||||
|
||||
**场景3:容错搜索**(允许拼写错误)
|
||||
```bash
|
||||
codexlens search "autheticate" --mode fuzzy
|
||||
```
|
||||
→ 使用fuzzy模式,trigram容错
|
||||
|
||||
**场景4:纯语义搜索**(需先生成嵌入)
|
||||
```bash
|
||||
codexlens search "password encryption with salt" --mode pure-vector
|
||||
```
|
||||
→ 使用pure-vector模式,理解语义意图
|
||||
|
||||
---
|
||||
|
||||
## 5. 实施检查清单
|
||||
|
||||
### 立即行动项 (P0)
|
||||
|
||||
- [ ] 安装语义搜索依赖:`pip install codexlens[semantic]`
|
||||
- [ ] 运行嵌入生成脚本(见3.1节)
|
||||
- [ ] 验证semantic_chunks表已创建且有数据
|
||||
- [ ] 测试vector模式搜索是否返回结果
|
||||
|
||||
### 短期改进 (P1)
|
||||
|
||||
- [ ] 添加pure_vector参数(见3.2节)
|
||||
- [ ] 更新CLI支持pure-vector模式
|
||||
- [ ] 添加嵌入生成进度提示
|
||||
- [ ] 文档更新:搜索模式使用指南
|
||||
|
||||
### 中期优化 (P2)
|
||||
|
||||
- [ ] 实现混合分块策略(见3.3节)
|
||||
- [ ] 添加查询扩展功能
|
||||
- [ ] 实现自适应权重调整
|
||||
- [ ] 性能基准测试
|
||||
|
||||
### 长期规划 (P3)
|
||||
|
||||
- [ ] 增量嵌入更新机制
|
||||
- [ ] 向量索引压缩
|
||||
- [ ] 集成FAISS加速
|
||||
- [ ] 多模态搜索(代码+文档)
|
||||
|
||||
---
|
||||
|
||||
## 6. 参考资源
|
||||
|
||||
### 代码文件
|
||||
|
||||
- 混合搜索引擎: `codex-lens/src/codexlens/search/hybrid_search.py`
|
||||
- 向量存储: `codex-lens/src/codexlens/semantic/vector_store.py`
|
||||
- 向量嵌入: `codex-lens/src/codexlens/semantic/embedder.py`
|
||||
- 代码分块: `codex-lens/src/codexlens/semantic/chunker.py`
|
||||
- 链式搜索: `codex-lens/src/codexlens/search/chain_search.py`
|
||||
|
||||
### 测试文件
|
||||
|
||||
- 对比测试: `codex-lens/tests/test_search_comparison.py`
|
||||
- 混合搜索E2E: `codex-lens/tests/test_hybrid_search_e2e.py`
|
||||
- CLI测试: `codex-lens/tests/test_cli_hybrid_search.py`
|
||||
|
||||
### 相关文档
|
||||
|
||||
- RRF算法: `codex-lens/src/codexlens/search/ranking.py`
|
||||
- 查询解析: `codex-lens/src/codexlens/search/query_parser.py`
|
||||
- 配置管理: `codex-lens/src/codexlens/config.py`
|
||||
|
||||
---
|
||||
|
||||
## 7. 结论
|
||||
|
||||
通过本次深入分析,我们明确了CodexLens搜索系统的优势和待优化点:
|
||||
|
||||
**优势**:
|
||||
1. ✓ 优秀的并行架构设计(双层并行)
|
||||
2. ✓ RRF融合算法实现合理
|
||||
3. ✓ 向量存储实现高效(numpy向量化+缓存)
|
||||
4. ✓ 模块化设计,易于扩展
|
||||
|
||||
**待优化**:
|
||||
1. 向量嵌入生成流程需要手动触发
|
||||
2. "vector模式"语义不清晰(实际包含exact搜索)
|
||||
3. 分块策略可以优化(混合策略)
|
||||
4. 缺少增量更新机制
|
||||
|
||||
**核心建议**:
|
||||
1. **立即**: 生成向量嵌入,解决返回空结果问题
|
||||
2. **短期**: 添加纯向量模式,澄清语义
|
||||
3. **中期**: 优化分块和查询策略,提升搜索质量
|
||||
4. **长期**: 性能优化和高级特性
|
||||
|
||||
通过实施这些改进,CodexLens的搜索功能将达到生产级别的质量和性能标准。
|
||||
|
||||
---
|
||||
|
||||
**报告完成时间**: 2025-12-16
|
||||
**分析工具**: 代码静态分析 + 实验测试 + 性能测评
|
||||
**下一步**: 实施P0优先级改进项
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,248 +0,0 @@
|
||||
# T6: CLI Integration for Hybrid Search - Implementation Summary
|
||||
|
||||
## Overview
|
||||
|
||||
Successfully integrated hybrid search capabilities into the CodexLens CLI with user-configurable options, migration support, and enhanced status reporting.
|
||||
|
||||
## Changes Made
|
||||
|
||||
### 1. Search Command Enhancement (`commands.py`)
|
||||
|
||||
**New `--mode` Parameter:**
|
||||
- Replaced `--hybrid` and `--exact-only` flags with unified `--mode` parameter
|
||||
- Supported modes: `exact`, `fuzzy`, `hybrid`, `vector`
|
||||
- Default: `exact` (backward compatible)
|
||||
|
||||
**Mode Validation:**
|
||||
```python
|
||||
valid_modes = ["exact", "fuzzy", "hybrid", "vector"]
|
||||
if mode not in valid_modes:
|
||||
# Error with helpful message
|
||||
```
|
||||
|
||||
**Weights Configuration:**
|
||||
- Accepts custom RRF weights via `--weights exact,fuzzy,vector`
|
||||
- Example: `--weights 0.5,0.3,0.2`
|
||||
- Automatic normalization if weights don't sum to 1.0
|
||||
- Validation for 3-value format
|
||||
|
||||
**Mode Mapping to SearchOptions:**
|
||||
```python
|
||||
hybrid_mode = mode == "hybrid"
|
||||
enable_fuzzy = mode in ["fuzzy", "hybrid"]
|
||||
|
||||
options = SearchOptions(
|
||||
hybrid_mode=hybrid_mode,
|
||||
enable_fuzzy=enable_fuzzy,
|
||||
hybrid_weights=hybrid_weights,
|
||||
)
|
||||
```
|
||||
|
||||
**Enhanced Output:**
|
||||
- Shows search mode in status line
|
||||
- Includes search source tags in verbose mode
|
||||
- JSON output includes mode and source information
|
||||
|
||||
### 2. Migrate Command (`commands.py`)
|
||||
|
||||
**New Command for Dual-FTS Upgrade:**
|
||||
```bash
|
||||
codex-lens migrate [path]
|
||||
```
|
||||
|
||||
**Features:**
|
||||
- Upgrades all `_index.db` files to schema version 4
|
||||
- Shows progress bar with percentage complete
|
||||
- Tracks: migrated, already up-to-date, errors
|
||||
- Safe operation preserving all data
|
||||
- Verbose mode shows per-database migration details
|
||||
|
||||
**Progress Tracking:**
|
||||
- Uses Rich progress bar with spinner
|
||||
- Shows percentage and count (N/Total)
|
||||
- Time elapsed indicator
|
||||
|
||||
### 3. Status Command Enhancement (`commands.py`)
|
||||
|
||||
**New Backend Status Display:**
|
||||
```
|
||||
Search Backends:
|
||||
Exact FTS: ✓ (unicode61)
|
||||
Fuzzy FTS: ✓ (trigram)
|
||||
Hybrid Search: ✓ (RRF fusion)
|
||||
Vector Search: ✗ (future)
|
||||
```
|
||||
|
||||
**Schema Version Detection:**
|
||||
- Checks first available `_index.db`
|
||||
- Reports schema version
|
||||
- Detects dual FTS table presence
|
||||
|
||||
**Feature Flags in JSON:**
|
||||
```json
|
||||
{
|
||||
"features": {
|
||||
"exact_fts": true,
|
||||
"fuzzy_fts": true,
|
||||
"hybrid_search": true,
|
||||
"vector_search": false
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 4. Output Rendering (`output.py`)
|
||||
|
||||
**Verbose Mode Support:**
|
||||
```python
|
||||
render_search_results(results, verbose=True)
|
||||
```
|
||||
|
||||
**Search Source Tags:**
|
||||
- `[E]` - Exact FTS result
|
||||
- `[F]` - Fuzzy FTS result
|
||||
- `[V]` - Vector search result
|
||||
- `[RRF]` - Fusion result
|
||||
|
||||
**Enhanced Table:**
|
||||
- New "Source" column in verbose mode
|
||||
- Shows result origin for debugging
|
||||
- Fusion scores visible
|
||||
|
||||
## Usage Examples
|
||||
|
||||
### 1. Search with Different Modes
|
||||
|
||||
```bash
|
||||
# Exact search (default)
|
||||
codex-lens search "authentication"
|
||||
|
||||
# Fuzzy search only
|
||||
codex-lens search "authentication" --mode fuzzy
|
||||
|
||||
# Hybrid search with RRF fusion
|
||||
codex-lens search "authentication" --mode hybrid
|
||||
|
||||
# Hybrid with custom weights
|
||||
codex-lens search "authentication" --mode hybrid --weights 0.5,0.3,0.2
|
||||
|
||||
# Verbose mode shows source tags
|
||||
codex-lens search "authentication" --mode hybrid -v
|
||||
```
|
||||
|
||||
### 2. Migration
|
||||
|
||||
```bash
|
||||
# Migrate current project
|
||||
codex-lens migrate
|
||||
|
||||
# Migrate specific project with verbose output
|
||||
codex-lens migrate /path/to/project -v
|
||||
|
||||
# JSON output for automation
|
||||
codex-lens migrate --json
|
||||
```
|
||||
|
||||
### 3. Status Checking
|
||||
|
||||
```bash
|
||||
# Check backend availability
|
||||
codex-lens status
|
||||
|
||||
# JSON output with feature flags
|
||||
codex-lens status --json
|
||||
```
|
||||
|
||||
## Testing
|
||||
|
||||
**Test Coverage:**
|
||||
- ✅ Mode parameter validation (exact, fuzzy, hybrid, vector)
|
||||
- ✅ Weights parsing and normalization
|
||||
- ✅ Help text shows all modes
|
||||
- ✅ Migrate command exists and accessible
|
||||
- ✅ Status command shows backends
|
||||
- ✅ Mode mapping to SearchOptions
|
||||
|
||||
**Test Results:**
|
||||
```
|
||||
11 passed in 2.27s
|
||||
```
|
||||
|
||||
## Integration Points
|
||||
|
||||
### With Phase 1 (Dual-FTS):
|
||||
- Uses `search_fts_exact()` for exact mode
|
||||
- Uses `search_fts_fuzzy()` for fuzzy mode
|
||||
- Schema migration via `_apply_migrations()`
|
||||
|
||||
### With Phase 2 (Hybrid Search):
|
||||
- Calls `HybridSearchEngine` for hybrid mode
|
||||
- Passes custom weights to RRF algorithm
|
||||
- Displays fusion scores and source tags
|
||||
|
||||
### With Existing CLI:
|
||||
- Backward compatible (default mode=exact)
|
||||
- Follows existing error handling patterns
|
||||
- Uses Rich for progress and formatting
|
||||
- Supports JSON output mode
|
||||
|
||||
## Done Criteria Verification
|
||||
|
||||
✅ **CLI search --mode exact uses only exact FTS table**
|
||||
- Mode validation ensures correct backend selection
|
||||
- `hybrid_mode=False, enable_fuzzy=False` for exact mode
|
||||
|
||||
✅ **--mode fuzzy uses only fuzzy table**
|
||||
- `hybrid_mode=False, enable_fuzzy=True` for fuzzy mode
|
||||
- Single backend execution
|
||||
|
||||
✅ **--mode hybrid fuses both**
|
||||
- `hybrid_mode=True, enable_fuzzy=True` activates RRF fusion
|
||||
- HybridSearchEngine coordinates parallel search
|
||||
|
||||
✅ **Custom weights via --weights 0.5,0.3,0.2**
|
||||
- Parses 3-value comma-separated format
|
||||
- Validates and normalizes to sum=1.0
|
||||
- Passes to RRF algorithm
|
||||
|
||||
✅ **Migration command completes Dual-FTS upgrade**
|
||||
- Shows progress bar with percentage
|
||||
- Tracks migration status per database
|
||||
- Safe operation with error handling
|
||||
|
||||
✅ **Search output shows [E], [F], [V] tags and fusion scores**
|
||||
- Verbose mode displays Source column
|
||||
- Tags extracted from `search_source` attribute
|
||||
- Fusion scores shown in Score column
|
||||
|
||||
## Files Modified
|
||||
|
||||
1. `codex-lens/src/codexlens/cli/commands.py`
|
||||
- Updated `search()` command with `--mode` parameter
|
||||
- Added `migrate()` command
|
||||
- Enhanced `status()` command
|
||||
- Added DirIndexStore import
|
||||
|
||||
2. `codex-lens/src/codexlens/cli/output.py`
|
||||
- Updated `render_search_results()` with verbose mode
|
||||
- Added source tag display logic
|
||||
|
||||
3. `codex-lens/tests/test_cli_hybrid_search.py` (new)
|
||||
- Comprehensive CLI integration tests
|
||||
- Mode validation tests
|
||||
- Weights parsing tests
|
||||
- Command availability tests
|
||||
|
||||
## Performance Impact
|
||||
|
||||
- **Exact mode**: Same as before (no overhead)
|
||||
- **Fuzzy mode**: Single FTS query (minimal overhead)
|
||||
- **Hybrid mode**: Parallel execution (2x I/O, no sequential penalty)
|
||||
- **Migration**: One-time operation, safe for large projects
|
||||
|
||||
## Next Steps
|
||||
|
||||
Users can now:
|
||||
1. Run `codex-lens migrate` to upgrade existing indexes
|
||||
2. Use `codex-lens search "query" --mode hybrid` for best results
|
||||
3. Check `codex-lens status` to verify enabled features
|
||||
4. Tune fusion weights for their use case via `--weights`
|
||||
@@ -1,459 +0,0 @@
|
||||
MCP integration
|
||||
mcp_servers
|
||||
You can configure Codex to use MCP servers to give Codex access to external applications, resources, or services.
|
||||
|
||||
Server configuration
|
||||
STDIO
|
||||
STDIO servers are MCP servers that you can launch directly via commands on your computer.
|
||||
|
||||
# The top-level table name must be `mcp_servers`
|
||||
# The sub-table name (`server-name` in this example) can be anything you would like.
|
||||
[mcp_servers.server_name]
|
||||
command = "npx"
|
||||
# Optional
|
||||
args = ["-y", "mcp-server"]
|
||||
# Optional: propagate additional env vars to the MCP server.
|
||||
# A default whitelist of env vars will be propagated to the MCP server.
|
||||
# https://github.com/openai/codex/blob/main/codex-rs/rmcp-client/src/utils.rs#L82
|
||||
env = { "API_KEY" = "value" }
|
||||
# or
|
||||
[mcp_servers.server_name.env]
|
||||
API_KEY = "value"
|
||||
# Optional: Additional list of environment variables that will be whitelisted in the MCP server's environment.
|
||||
env_vars = ["API_KEY2"]
|
||||
|
||||
# Optional: cwd that the command will be run from
|
||||
cwd = "/Users/<user>/code/my-server"
|
||||
Streamable HTTP
|
||||
Streamable HTTP servers enable Codex to talk to resources that are accessed via a http url (either on localhost or another domain).
|
||||
|
||||
[mcp_servers.figma]
|
||||
url = "https://mcp.figma.com/mcp"
|
||||
# Optional environment variable containing a bearer token to use for auth
|
||||
bearer_token_env_var = "ENV_VAR"
|
||||
# Optional map of headers with hard-coded values.
|
||||
http_headers = { "HEADER_NAME" = "HEADER_VALUE" }
|
||||
# Optional map of headers whose values will be replaced with the environment variable.
|
||||
env_http_headers = { "HEADER_NAME" = "ENV_VAR" }
|
||||
Streamable HTTP connections always use the experimental Rust MCP client under the hood, so expect occasional rough edges. OAuth login flows are gated on the rmcp_client = true flag:
|
||||
|
||||
[features]
|
||||
rmcp_client = true
|
||||
After enabling it, run codex mcp login <server-name> when the server supports OAuth.
|
||||
|
||||
Other configuration options
|
||||
# Optional: override the default 10s startup timeout
|
||||
startup_timeout_sec = 20
|
||||
# Optional: override the default 60s per-tool timeout
|
||||
tool_timeout_sec = 30
|
||||
# Optional: disable a server without removing it
|
||||
enabled = false
|
||||
# Optional: only expose a subset of tools from this server
|
||||
enabled_tools = ["search", "summarize"]
|
||||
# Optional: hide specific tools (applied after `enabled_tools`, if set)
|
||||
disabled_tools = ["search"]
|
||||
When both enabled_tools and disabled_tools are specified, Codex first restricts the server to the allow-list and then removes any tools that appear in the deny-list.
|
||||
|
||||
MCP CLI commands
|
||||
# List all available commands
|
||||
codex mcp --help
|
||||
|
||||
# Add a server (env can be repeated; `--` separates the launcher command)
|
||||
codex mcp add docs -- docs-server --port 4000
|
||||
|
||||
# List configured servers (pretty table or JSON)
|
||||
codex mcp list
|
||||
codex mcp list --json
|
||||
|
||||
# Show one server (table or JSON)
|
||||
codex mcp get docs
|
||||
codex mcp get docs --json
|
||||
|
||||
# Remove a server
|
||||
codex mcp remove docs
|
||||
|
||||
# Log in to a streamable HTTP server that supports oauth
|
||||
codex mcp login SERVER_NAME
|
||||
|
||||
# Log out from a streamable HTTP server that supports oauth
|
||||
codex mcp logout SERVER_NAME
|
||||
Examples of useful MCPs
|
||||
There is an ever growing list of useful MCP servers that can be helpful while you are working with Codex.
|
||||
|
||||
Some of the most common MCPs we've seen are:
|
||||
|
||||
Context7 — connect to a wide range of up-to-date developer documentation
|
||||
Figma Local and Remote - access to your Figma designs
|
||||
Playwright - control and inspect a browser using Playwright
|
||||
Chrome Developer Tools — control and inspect a Chrome browser
|
||||
Sentry — access to your Sentry logs
|
||||
GitHub — Control over your GitHub account beyond what git allows (like controlling PRs, issues, etc.)
|
||||
|
||||
|
||||
# Example config.toml
|
||||
|
||||
Use this example configuration as a starting point. For an explanation of each field and additional context, see [Configuration](./config.md). Copy the snippet below to `~/.codex/config.toml` and adjust values as needed.
|
||||
|
||||
```toml
|
||||
# Codex example configuration (config.toml)
|
||||
#
|
||||
# This file lists all keys Codex reads from config.toml, their default values,
|
||||
# and concise explanations. Values here mirror the effective defaults compiled
|
||||
# into the CLI. Adjust as needed.
|
||||
#
|
||||
# Notes
|
||||
# - Root keys must appear before tables in TOML.
|
||||
# - Optional keys that default to "unset" are shown commented out with notes.
|
||||
# - MCP servers, profiles, and model providers are examples; remove or edit.
|
||||
|
||||
################################################################################
|
||||
# Core Model Selection
|
||||
################################################################################
|
||||
|
||||
# Primary model used by Codex. Default: "gpt-5.1-codex-max" on all platforms.
|
||||
model = "gpt-5.1-codex-max"
|
||||
|
||||
# Model used by the /review feature (code reviews). Default: "gpt-5.1-codex-max".
|
||||
review_model = "gpt-5.1-codex-max"
|
||||
|
||||
# Provider id selected from [model_providers]. Default: "openai".
|
||||
model_provider = "openai"
|
||||
|
||||
# Optional manual model metadata. When unset, Codex auto-detects from model.
|
||||
# Uncomment to force values.
|
||||
# model_context_window = 128000 # tokens; default: auto for model
|
||||
# model_auto_compact_token_limit = 0 # disable/override auto; default: model family specific
|
||||
# tool_output_token_limit = 10000 # tokens stored per tool output; default: 10000 for gpt-5.1-codex-max
|
||||
|
||||
################################################################################
|
||||
# Reasoning & Verbosity (Responses API capable models)
|
||||
################################################################################
|
||||
|
||||
# Reasoning effort: minimal | low | medium | high | xhigh (default: medium; xhigh on gpt-5.1-codex-max and gpt-5.2)
|
||||
model_reasoning_effort = "medium"
|
||||
|
||||
# Reasoning summary: auto | concise | detailed | none (default: auto)
|
||||
model_reasoning_summary = "auto"
|
||||
|
||||
# Text verbosity for GPT-5 family (Responses API): low | medium | high (default: medium)
|
||||
model_verbosity = "medium"
|
||||
|
||||
# Force-enable reasoning summaries for current model (default: false)
|
||||
model_supports_reasoning_summaries = false
|
||||
|
||||
# Force reasoning summary format: none | experimental (default: none)
|
||||
model_reasoning_summary_format = "none"
|
||||
|
||||
################################################################################
|
||||
# Instruction Overrides
|
||||
################################################################################
|
||||
|
||||
# Additional user instructions appended after AGENTS.md. Default: unset.
|
||||
# developer_instructions = ""
|
||||
|
||||
# Optional legacy base instructions override (prefer AGENTS.md). Default: unset.
|
||||
# instructions = ""
|
||||
|
||||
# Inline override for the history compaction prompt. Default: unset.
|
||||
# compact_prompt = ""
|
||||
|
||||
# Override built-in base instructions with a file path. Default: unset.
|
||||
# experimental_instructions_file = "/absolute/or/relative/path/to/instructions.txt"
|
||||
|
||||
# Load the compact prompt override from a file. Default: unset.
|
||||
# experimental_compact_prompt_file = "/absolute/or/relative/path/to/compact_prompt.txt"
|
||||
|
||||
################################################################################
|
||||
# Approval & Sandbox
|
||||
################################################################################
|
||||
|
||||
# When to ask for command approval:
|
||||
# - untrusted: only known-safe read-only commands auto-run; others prompt
|
||||
# - on-failure: auto-run in sandbox; prompt only on failure for escalation
|
||||
# - on-request: model decides when to ask (default)
|
||||
# - never: never prompt (risky)
|
||||
approval_policy = "on-request"
|
||||
|
||||
# Filesystem/network sandbox policy for tool calls:
|
||||
# - read-only (default)
|
||||
# - workspace-write
|
||||
# - danger-full-access (no sandbox; extremely risky)
|
||||
sandbox_mode = "read-only"
|
||||
|
||||
# Extra settings used only when sandbox_mode = "workspace-write".
|
||||
[sandbox_workspace_write]
|
||||
# Additional writable roots beyond the workspace (cwd). Default: []
|
||||
writable_roots = []
|
||||
# Allow outbound network access inside the sandbox. Default: false
|
||||
network_access = false
|
||||
# Exclude $TMPDIR from writable roots. Default: false
|
||||
exclude_tmpdir_env_var = false
|
||||
# Exclude /tmp from writable roots. Default: false
|
||||
exclude_slash_tmp = false
|
||||
|
||||
################################################################################
|
||||
# Shell Environment Policy for spawned processes
|
||||
################################################################################
|
||||
|
||||
[shell_environment_policy]
|
||||
# inherit: all (default) | core | none
|
||||
inherit = "all"
|
||||
# Skip default excludes for names containing KEY/TOKEN (case-insensitive). Default: false
|
||||
ignore_default_excludes = false
|
||||
# Case-insensitive glob patterns to remove (e.g., "AWS_*", "AZURE_*"). Default: []
|
||||
exclude = []
|
||||
# Explicit key/value overrides (always win). Default: {}
|
||||
set = {}
|
||||
# Whitelist; if non-empty, keep only matching vars. Default: []
|
||||
include_only = []
|
||||
# Experimental: run via user shell profile. Default: false
|
||||
experimental_use_profile = false
|
||||
|
||||
################################################################################
|
||||
# History & File Opener
|
||||
################################################################################
|
||||
|
||||
[history]
|
||||
# save-all (default) | none
|
||||
persistence = "save-all"
|
||||
# Maximum bytes for history file; oldest entries are trimmed when exceeded. Example: 5242880
|
||||
# max_bytes = 0
|
||||
|
||||
# URI scheme for clickable citations: vscode (default) | vscode-insiders | windsurf | cursor | none
|
||||
file_opener = "vscode"
|
||||
|
||||
################################################################################
|
||||
# UI, Notifications, and Misc
|
||||
################################################################################
|
||||
|
||||
[tui]
|
||||
# Desktop notifications from the TUI: boolean or filtered list. Default: true
|
||||
# Examples: false | ["agent-turn-complete", "approval-requested"]
|
||||
notifications = false
|
||||
|
||||
# Enables welcome/status/spinner animations. Default: true
|
||||
animations = true
|
||||
|
||||
# Suppress internal reasoning events from output. Default: false
|
||||
hide_agent_reasoning = false
|
||||
|
||||
# Show raw reasoning content when available. Default: false
|
||||
show_raw_agent_reasoning = false
|
||||
|
||||
# Disable burst-paste detection in the TUI. Default: false
|
||||
disable_paste_burst = false
|
||||
|
||||
# Track Windows onboarding acknowledgement (Windows only). Default: false
|
||||
windows_wsl_setup_acknowledged = false
|
||||
|
||||
# External notifier program (argv array). When unset: disabled.
|
||||
# Example: notify = ["notify-send", "Codex"]
|
||||
# notify = [ ]
|
||||
|
||||
# In-product notices (mostly set automatically by Codex).
|
||||
[notice]
|
||||
# hide_full_access_warning = true
|
||||
# hide_rate_limit_model_nudge = true
|
||||
|
||||
################################################################################
|
||||
# Authentication & Login
|
||||
################################################################################
|
||||
|
||||
# Where to persist CLI login credentials: file (default) | keyring | auto
|
||||
cli_auth_credentials_store = "file"
|
||||
|
||||
# Base URL for ChatGPT auth flow (not OpenAI API). Default:
|
||||
chatgpt_base_url = "https://chatgpt.com/backend-api/"
|
||||
|
||||
# Restrict ChatGPT login to a specific workspace id. Default: unset.
|
||||
# forced_chatgpt_workspace_id = ""
|
||||
|
||||
# Force login mechanism when Codex would normally auto-select. Default: unset.
|
||||
# Allowed values: chatgpt | api
|
||||
# forced_login_method = "chatgpt"
|
||||
|
||||
# Preferred store for MCP OAuth credentials: auto (default) | file | keyring
|
||||
mcp_oauth_credentials_store = "auto"
|
||||
|
||||
################################################################################
|
||||
# Project Documentation Controls
|
||||
################################################################################
|
||||
|
||||
# Max bytes from AGENTS.md to embed into first-turn instructions. Default: 32768
|
||||
project_doc_max_bytes = 32768
|
||||
|
||||
# Ordered fallbacks when AGENTS.md is missing at a directory level. Default: []
|
||||
project_doc_fallback_filenames = []
|
||||
|
||||
################################################################################
|
||||
# Tools (legacy toggles kept for compatibility)
|
||||
################################################################################
|
||||
|
||||
[tools]
|
||||
# Enable web search tool (alias: web_search_request). Default: false
|
||||
web_search = false
|
||||
|
||||
# Enable the view_image tool so the agent can attach local images. Default: true
|
||||
view_image = true
|
||||
|
||||
# (Alias accepted) You can also write:
|
||||
# web_search_request = false
|
||||
|
||||
################################################################################
|
||||
# Centralized Feature Flags (preferred)
|
||||
################################################################################
|
||||
|
||||
[features]
|
||||
# Leave this table empty to accept defaults. Set explicit booleans to opt in/out.
|
||||
unified_exec = false
|
||||
rmcp_client = false
|
||||
apply_patch_freeform = false
|
||||
view_image_tool = true
|
||||
web_search_request = false
|
||||
ghost_commit = false
|
||||
enable_experimental_windows_sandbox = false
|
||||
skills = false
|
||||
|
||||
################################################################################
|
||||
# Experimental toggles (legacy; prefer [features])
|
||||
################################################################################
|
||||
|
||||
# Include apply_patch via freeform editing path (affects default tool set). Default: false
|
||||
experimental_use_freeform_apply_patch = false
|
||||
|
||||
# Define MCP servers under this table. Leave empty to disable.
|
||||
[mcp_servers]
|
||||
|
||||
# --- Example: STDIO transport ---
|
||||
# [mcp_servers.docs]
|
||||
# command = "docs-server" # required
|
||||
# args = ["--port", "4000"] # optional
|
||||
# env = { "API_KEY" = "value" } # optional key/value pairs copied as-is
|
||||
# env_vars = ["ANOTHER_SECRET"] # optional: forward these from the parent env
|
||||
# cwd = "/path/to/server" # optional working directory override
|
||||
# startup_timeout_sec = 10.0 # optional; default 10.0 seconds
|
||||
# # startup_timeout_ms = 10000 # optional alias for startup timeout (milliseconds)
|
||||
# tool_timeout_sec = 60.0 # optional; default 60.0 seconds
|
||||
# enabled_tools = ["search", "summarize"] # optional allow-list
|
||||
# disabled_tools = ["slow-tool"] # optional deny-list (applied after allow-list)
|
||||
|
||||
# --- Example: Streamable HTTP transport ---
|
||||
# [mcp_servers.github]
|
||||
# url = "https://github-mcp.example.com/mcp" # required
|
||||
# bearer_token_env_var = "GITHUB_TOKEN" # optional; Authorization: Bearer <token>
|
||||
# http_headers = { "X-Example" = "value" } # optional static headers
|
||||
# env_http_headers = { "X-Auth" = "AUTH_ENV" } # optional headers populated from env vars
|
||||
# startup_timeout_sec = 10.0 # optional
|
||||
# tool_timeout_sec = 60.0 # optional
|
||||
# enabled_tools = ["list_issues"] # optional allow-list
|
||||
|
||||
################################################################################
|
||||
# Model Providers (extend/override built-ins)
|
||||
################################################################################
|
||||
|
||||
# Built-ins include:
|
||||
# - openai (Responses API; requires login or OPENAI_API_KEY via auth flow)
|
||||
# - oss (Chat Completions API; defaults to http://localhost:11434/v1)
|
||||
|
||||
[model_providers]
|
||||
|
||||
# --- Example: override OpenAI with explicit base URL or headers ---
|
||||
# [model_providers.openai]
|
||||
# name = "OpenAI"
|
||||
# base_url = "https://api.openai.com/v1" # default if unset
|
||||
# wire_api = "responses" # "responses" | "chat" (default varies)
|
||||
# # requires_openai_auth = true # built-in OpenAI defaults to true
|
||||
# # request_max_retries = 4 # default 4; max 100
|
||||
# # stream_max_retries = 5 # default 5; max 100
|
||||
# # stream_idle_timeout_ms = 300000 # default 300_000 (5m)
|
||||
# # experimental_bearer_token = "sk-example" # optional dev-only direct bearer token
|
||||
# # http_headers = { "X-Example" = "value" }
|
||||
# # env_http_headers = { "OpenAI-Organization" = "OPENAI_ORGANIZATION", "OpenAI-Project" = "OPENAI_PROJECT" }
|
||||
|
||||
# --- Example: Azure (Chat/Responses depending on endpoint) ---
|
||||
# [model_providers.azure]
|
||||
# name = "Azure"
|
||||
# base_url = "https://YOUR_PROJECT_NAME.openai.azure.com/openai"
|
||||
# wire_api = "responses" # or "chat" per endpoint
|
||||
# query_params = { api-version = "2025-04-01-preview" }
|
||||
# env_key = "AZURE_OPENAI_API_KEY"
|
||||
# # env_key_instructions = "Set AZURE_OPENAI_API_KEY in your environment"
|
||||
|
||||
# --- Example: Local OSS (e.g., Ollama-compatible) ---
|
||||
# [model_providers.ollama]
|
||||
# name = "Ollama"
|
||||
# base_url = "http://localhost:11434/v1"
|
||||
# wire_api = "chat"
|
||||
|
||||
################################################################################
|
||||
# Profiles (named presets)
|
||||
################################################################################
|
||||
|
||||
# Active profile name. When unset, no profile is applied.
|
||||
# profile = "default"
|
||||
|
||||
[profiles]
|
||||
|
||||
# [profiles.default]
|
||||
# model = "gpt-5.1-codex-max"
|
||||
# model_provider = "openai"
|
||||
# approval_policy = "on-request"
|
||||
# sandbox_mode = "read-only"
|
||||
# model_reasoning_effort = "medium"
|
||||
# model_reasoning_summary = "auto"
|
||||
# model_verbosity = "medium"
|
||||
# chatgpt_base_url = "https://chatgpt.com/backend-api/"
|
||||
# experimental_compact_prompt_file = "compact_prompt.txt"
|
||||
# include_apply_patch_tool = false
|
||||
# experimental_use_freeform_apply_patch = false
|
||||
# tools_web_search = false
|
||||
# tools_view_image = true
|
||||
# features = { unified_exec = false }
|
||||
|
||||
################################################################################
|
||||
# Projects (trust levels)
|
||||
################################################################################
|
||||
|
||||
# Mark specific worktrees as trusted. Only "trusted" is recognized.
|
||||
[projects]
|
||||
# [projects."/absolute/path/to/project"]
|
||||
# trust_level = "trusted"
|
||||
|
||||
################################################################################
|
||||
# OpenTelemetry (OTEL) – disabled by default
|
||||
################################################################################
|
||||
|
||||
[otel]
|
||||
# Include user prompt text in logs. Default: false
|
||||
log_user_prompt = false
|
||||
# Environment label applied to telemetry. Default: "dev"
|
||||
environment = "dev"
|
||||
# Exporter: none (default) | otlp-http | otlp-grpc
|
||||
exporter = "none"
|
||||
|
||||
# Example OTLP/HTTP exporter configuration
|
||||
# [otel.exporter."otlp-http"]
|
||||
# endpoint = "https://otel.example.com/v1/logs"
|
||||
# protocol = "binary" # "binary" | "json"
|
||||
|
||||
# [otel.exporter."otlp-http".headers]
|
||||
# "x-otlp-api-key" = "${OTLP_TOKEN}"
|
||||
|
||||
# Example OTLP/gRPC exporter configuration
|
||||
# [otel.exporter."otlp-grpc"]
|
||||
# endpoint = "https://otel.example.com:4317",
|
||||
# headers = { "x-otlp-meta" = "abc123" }
|
||||
|
||||
# Example OTLP exporter with mutual TLS
|
||||
# [otel.exporter."otlp-http"]
|
||||
# endpoint = "https://otel.example.com/v1/logs"
|
||||
# protocol = "binary"
|
||||
|
||||
# [otel.exporter."otlp-http".headers]
|
||||
# "x-otlp-api-key" = "${OTLP_TOKEN}"
|
||||
|
||||
# [otel.exporter."otlp-http".tls]
|
||||
# ca-certificate = "certs/otel-ca.pem"
|
||||
# client-certificate = "/etc/codex/certs/client.pem"
|
||||
# client-private-key = "/etc/codex/certs/client-key.pem"
|
||||
```
|
||||
@@ -1,187 +0,0 @@
|
||||
# Test Quality Enhancements - Implementation Summary
|
||||
|
||||
**Date**: 2025-12-16
|
||||
**Status**: ✅ Complete - All 4 recommendations implemented and passing
|
||||
|
||||
## Overview
|
||||
|
||||
Implemented all 4 test quality recommendations from Gemini's comprehensive analysis to enhance test coverage and robustness across the codex-lens test suite.
|
||||
|
||||
## Recommendation 1: Verify True Fuzzy Matching ✅
|
||||
|
||||
**File**: `tests/test_dual_fts.py`
|
||||
**Test Class**: `TestDualFTSPerformance`
|
||||
**New Test**: `test_fuzzy_substring_matching`
|
||||
|
||||
### Implementation
|
||||
- Verifies trigram tokenizer enables partial token matching
|
||||
- Tests that searching for "func" matches "function0", "function1", etc.
|
||||
- Gracefully skips if trigram tokenizer unavailable
|
||||
- Validates BM25 scoring for fuzzy results
|
||||
|
||||
### Key Features
|
||||
- Runtime detection of trigram support
|
||||
- Validates substring matching capability
|
||||
- Ensures proper score ordering (negative BM25)
|
||||
|
||||
### Test Result
|
||||
```bash
|
||||
PASSED tests/test_dual_fts.py::TestDualFTSPerformance::test_fuzzy_substring_matching
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Recommendation 2: Enable Mocked Vector Search ✅
|
||||
|
||||
**File**: `tests/test_hybrid_search_e2e.py`
|
||||
**Test Class**: `TestHybridSearchWithVectorMock`
|
||||
**New Test**: `test_hybrid_with_vector_enabled`
|
||||
|
||||
### Implementation
|
||||
- Mocks vector search to return predefined results
|
||||
- Tests RRF fusion with exact + fuzzy + vector sources
|
||||
- Validates hybrid search handles vector integration correctly
|
||||
- Uses `unittest.mock.patch` for clean mocking
|
||||
|
||||
### Key Features
|
||||
- Mock SearchResult objects with scores
|
||||
- Tests enable_vector=True parameter
|
||||
- Validates RRF fusion score calculation (positive scores)
|
||||
- Gracefully handles missing vector search module
|
||||
|
||||
### Test Result
|
||||
```bash
|
||||
PASSED tests/test_hybrid_search_e2e.py::TestHybridSearchWithVectorMock::test_hybrid_with_vector_enabled
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Recommendation 3: Complex Query Parser Stress Tests ✅
|
||||
|
||||
**File**: `tests/test_query_parser.py`
|
||||
**Test Class**: `TestComplexBooleanQueries`
|
||||
**New Tests**: 5 comprehensive tests
|
||||
|
||||
### Implementation
|
||||
|
||||
#### 1. `test_nested_boolean_and_or`
|
||||
- Tests: `(login OR logout) AND user`
|
||||
- Validates nested parentheses preservation
|
||||
- Ensures boolean operators remain intact
|
||||
|
||||
#### 2. `test_mixed_operators_with_expansion`
|
||||
- Tests: `UserAuth AND (login OR logout)`
|
||||
- Verifies CamelCase expansion doesn't break operators
|
||||
- Ensures expansion + boolean logic coexist
|
||||
|
||||
#### 3. `test_quoted_phrases_with_boolean`
|
||||
- Tests: `"user authentication" AND login`
|
||||
- Validates quoted phrase preservation
|
||||
- Ensures AND operator survives
|
||||
|
||||
#### 4. `test_not_operator_preservation`
|
||||
- Tests: `login NOT logout`
|
||||
- Confirms NOT operator handling
|
||||
- Validates negation logic
|
||||
|
||||
#### 5. `test_complex_nested_three_levels`
|
||||
- Tests: `((UserAuth OR login) AND session) OR token`
|
||||
- Stress tests deep nesting (3 levels)
|
||||
- Validates multiple parentheses pairs
|
||||
|
||||
### Test Results
|
||||
```bash
|
||||
PASSED tests/test_query_parser.py::TestComplexBooleanQueries::test_nested_boolean_and_or
|
||||
PASSED tests/test_query_parser.py::TestComplexBooleanQueries::test_mixed_operators_with_expansion
|
||||
PASSED tests/test_query_parser.py::TestComplexBooleanQueries::test_quoted_phrases_with_boolean
|
||||
PASSED tests/test_query_parser.py::TestComplexBooleanQueries::test_not_operator_preservation
|
||||
PASSED tests/test_query_parser.py::TestComplexBooleanQueries::test_complex_nested_three_levels
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Recommendation 4: Migration Reversibility Tests ✅
|
||||
|
||||
**File**: `tests/test_dual_fts.py`
|
||||
**Test Class**: `TestMigrationRecovery`
|
||||
**New Tests**: 2 migration robustness tests
|
||||
|
||||
### Implementation
|
||||
|
||||
#### 1. `test_migration_preserves_data_on_failure`
|
||||
- Creates v2 database with test data
|
||||
- Attempts migration (may succeed or fail)
|
||||
- Validates data preservation in both scenarios
|
||||
- Smart column detection (path vs full_path)
|
||||
|
||||
**Key Features**:
|
||||
- Checks schema version to determine column names
|
||||
- Handles both migration success and failure
|
||||
- Ensures no data loss
|
||||
|
||||
#### 2. `test_migration_idempotent_after_partial_failure`
|
||||
- Tests retry capability after partial migration
|
||||
- Validates graceful handling of repeated initialization
|
||||
- Ensures database remains in usable state
|
||||
|
||||
**Key Features**:
|
||||
- Double initialization without errors
|
||||
- Table existence verification
|
||||
- Safe retry mechanism
|
||||
|
||||
### Test Results
|
||||
```bash
|
||||
PASSED tests/test_dual_fts.py::TestMigrationRecovery::test_migration_preserves_data_on_failure
|
||||
PASSED tests/test_dual_fts.py::TestMigrationRecovery::test_migration_idempotent_after_partial_failure
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Test Suite Statistics
|
||||
|
||||
### Overall Results
|
||||
```
|
||||
91 passed, 2 skipped, 2 warnings in 3.31s
|
||||
```
|
||||
|
||||
### New Tests Added
|
||||
- **Recommendation 1**: 1 test (fuzzy substring matching)
|
||||
- **Recommendation 2**: 1 test (vector mock integration)
|
||||
- **Recommendation 3**: 5 tests (complex boolean queries)
|
||||
- **Recommendation 4**: 2 tests (migration recovery)
|
||||
|
||||
**Total New Tests**: 9
|
||||
|
||||
### Coverage Improvements
|
||||
- **Fuzzy Search**: Now validates actual trigram substring matching
|
||||
- **Hybrid Search**: Tests vector integration with mocks
|
||||
- **Query Parser**: Handles complex nested boolean logic
|
||||
- **Migration**: Validates data preservation and retry capability
|
||||
|
||||
---
|
||||
|
||||
## Code Quality
|
||||
|
||||
### Best Practices Applied
|
||||
1. **Graceful Degradation**: Tests skip when features unavailable (trigram)
|
||||
2. **Clean Mocking**: Uses `unittest.mock` for vector search
|
||||
3. **Smart Assertions**: Adapts to migration outcomes dynamically
|
||||
4. **Edge Case Handling**: Tests multiple nesting levels and operators
|
||||
|
||||
### Integration
|
||||
- All tests integrate seamlessly with existing pytest fixtures
|
||||
- Maintains 100% pass rate across test suite
|
||||
- No breaking changes to existing tests
|
||||
|
||||
---
|
||||
|
||||
## Validation
|
||||
|
||||
All 4 recommendations successfully implemented and verified:
|
||||
|
||||
✅ **Recommendation 1**: Fuzzy substring matching with trigram validation
|
||||
✅ **Recommendation 2**: Vector search mocking for hybrid fusion testing
|
||||
✅ **Recommendation 3**: Complex boolean query stress tests (5 tests)
|
||||
✅ **Recommendation 4**: Migration recovery and idempotency tests (2 tests)
|
||||
|
||||
**Final Status**: Production-ready, all tests passing
|
||||
@@ -1,156 +0,0 @@
|
||||
"""Demo script for association tree building.
|
||||
|
||||
This script demonstrates how to use the AssociationTreeBuilder and
|
||||
ResultDeduplicator to explore code relationships via LSP call hierarchy.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Add parent directory to path for imports
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
||||
|
||||
from codexlens.lsp.standalone_manager import StandaloneLspManager
|
||||
from codexlens.search.association_tree import (
|
||||
AssociationTreeBuilder,
|
||||
ResultDeduplicator,
|
||||
)
|
||||
|
||||
|
||||
async def demo_simple_tree():
|
||||
"""Build a simple call tree from a Python file."""
|
||||
print("=" * 70)
|
||||
print("Association Tree Demo")
|
||||
print("=" * 70)
|
||||
print()
|
||||
|
||||
# Use this file as the test subject
|
||||
test_file = Path(__file__).resolve()
|
||||
workspace_root = test_file.parent.parent
|
||||
|
||||
print(f"Workspace: {workspace_root}")
|
||||
print(f"Test file: {test_file.name}")
|
||||
print()
|
||||
|
||||
# Initialize LSP manager
|
||||
async with StandaloneLspManager(
|
||||
workspace_root=str(workspace_root),
|
||||
timeout=10.0,
|
||||
) as lsp:
|
||||
print("LSP manager initialized")
|
||||
print()
|
||||
|
||||
# Create tree builder
|
||||
builder = AssociationTreeBuilder(lsp, timeout=5.0)
|
||||
|
||||
# Build tree from a function in this file
|
||||
# Using line 50 as an example (adjust based on actual file)
|
||||
print(f"Building call tree from {test_file.name}:50...")
|
||||
tree = await builder.build_tree(
|
||||
seed_file_path=str(test_file),
|
||||
seed_line=50,
|
||||
seed_character=1,
|
||||
max_depth=3,
|
||||
expand_callers=True,
|
||||
expand_callees=True,
|
||||
)
|
||||
|
||||
print(f"Tree built: {tree}")
|
||||
print(f" Roots: {len(tree.roots)}")
|
||||
print(f" Total unique nodes: {len(tree.all_nodes)}")
|
||||
print(f" Total node instances: {len(tree.node_list)}")
|
||||
print(f" Edges: {len(tree.edges)}")
|
||||
print()
|
||||
|
||||
if tree.roots:
|
||||
print("Root nodes:")
|
||||
for root in tree.roots:
|
||||
print(f" - {root.item.name} ({root.item.kind})")
|
||||
print(f" {root.item.file_path}:{root.item.range.start_line}")
|
||||
print()
|
||||
|
||||
# Deduplicate and score
|
||||
print("Deduplicating and scoring nodes...")
|
||||
deduplicator = ResultDeduplicator(
|
||||
depth_weight=0.4,
|
||||
frequency_weight=0.3,
|
||||
kind_weight=0.3,
|
||||
)
|
||||
|
||||
unique_nodes = deduplicator.deduplicate(tree, max_results=20)
|
||||
print(f"Found {len(unique_nodes)} unique nodes")
|
||||
print()
|
||||
|
||||
if unique_nodes:
|
||||
print("Top 10 nodes by score:")
|
||||
print("-" * 70)
|
||||
for i, node in enumerate(unique_nodes[:10], 1):
|
||||
print(f"{i:2}. {node.name} ({node.kind})")
|
||||
print(f" Location: {Path(node.file_path).name}:{node.range.start_line}")
|
||||
print(
|
||||
f" Depth: {node.min_depth}, "
|
||||
f"Occurrences: {node.occurrences}, "
|
||||
f"Score: {node.score:.3f}"
|
||||
)
|
||||
if node.paths:
|
||||
print(f" Paths: {len(node.paths)}")
|
||||
print()
|
||||
|
||||
# Show filtering capabilities
|
||||
functions = deduplicator.filter_by_kind(
|
||||
unique_nodes, ["function", "method"]
|
||||
)
|
||||
print(f"Functions/methods only: {len(functions)} nodes")
|
||||
|
||||
if functions:
|
||||
print("Top 5 functions:")
|
||||
for i, node in enumerate(functions[:5], 1):
|
||||
print(f" {i}. {node.name} (score: {node.score:.3f})")
|
||||
|
||||
else:
|
||||
print("No nodes found. Try a different seed location.")
|
||||
|
||||
print()
|
||||
print("Demo complete!")
|
||||
|
||||
|
||||
async def demo_cycle_detection():
|
||||
"""Demonstrate cycle detection in call trees."""
|
||||
print("\n" + "=" * 70)
|
||||
print("Cycle Detection Demo")
|
||||
print("=" * 70)
|
||||
print()
|
||||
|
||||
# Create a simple Python file with circular calls for testing
|
||||
test_code = '''
|
||||
def func_a():
|
||||
"""Function A calls B."""
|
||||
func_b()
|
||||
|
||||
def func_b():
|
||||
"""Function B calls A (creates a cycle)."""
|
||||
func_a()
|
||||
'''
|
||||
|
||||
print("This demo would detect cycles in:")
|
||||
print(test_code)
|
||||
print("The tree builder automatically marks cycle nodes to prevent infinite expansion.")
|
||||
|
||||
|
||||
def main():
|
||||
"""Run the demo."""
|
||||
try:
|
||||
asyncio.run(demo_simple_tree())
|
||||
demo_cycle_detection()
|
||||
except KeyboardInterrupt:
|
||||
print("\nDemo interrupted by user")
|
||||
except Exception as e:
|
||||
print(f"\nError running demo: {e}")
|
||||
import traceback
|
||||
|
||||
traceback.print_exc()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,40 +0,0 @@
|
||||
"""Debug URI format issues."""
|
||||
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
from urllib.parse import quote
|
||||
|
||||
def test_uri_formats():
|
||||
"""Compare different URI formats."""
|
||||
file_path = Path("D:/Claude_dms3/codex-lens/test_simple_function.py")
|
||||
|
||||
print("URI Format Comparison")
|
||||
print("="*80)
|
||||
|
||||
# Method 1: Path.as_uri()
|
||||
uri1 = file_path.resolve().as_uri()
|
||||
print(f"1. Path.as_uri(): {uri1}")
|
||||
|
||||
# Method 2: Manual construction
|
||||
uri2 = f"file:///{str(file_path.resolve()).replace(chr(92), '/')}"
|
||||
print(f"2. Manual (forward /): {uri2}")
|
||||
|
||||
# Method 3: With quote
|
||||
path_str = str(file_path.resolve()).replace(chr(92), '/')
|
||||
uri3 = f"file:///{quote(path_str, safe='/:')}"
|
||||
print(f"3. With quote: {uri3}")
|
||||
|
||||
# Method 4: Lowercase drive
|
||||
path_lower = str(file_path.resolve()).replace(chr(92), '/')
|
||||
if len(path_lower) > 1 and path_lower[1] == ':':
|
||||
path_lower = path_lower[0].lower() + path_lower[1:]
|
||||
uri4 = f"file:///{path_lower}"
|
||||
print(f"4. Lowercase drive: {uri4}")
|
||||
|
||||
# What Pyright shows in logs
|
||||
print(f"\n5. Pyright log format: file:///d%3A/Claude_dms3/codex-lens/...")
|
||||
|
||||
return uri1, uri4
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_uri_formats()
|
||||
@@ -1,326 +0,0 @@
|
||||
"""Search method comparison benchmark.
|
||||
|
||||
Compares different search strategies:
|
||||
1. Pure FTS (exact + fuzzy matching)
|
||||
2. Pure Vector (semantic search only)
|
||||
3. Hybrid Fusion (FTS + Vector with RRF)
|
||||
4. Vector + LSP Association Tree (new strategy)
|
||||
|
||||
Usage:
|
||||
python examples/search_comparison_benchmark.py
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Any
|
||||
|
||||
from codexlens.config import Config
|
||||
from codexlens.entities import SearchResult
|
||||
from codexlens.search.hybrid_search import HybridSearchEngine
|
||||
from codexlens.lsp.standalone_manager import StandaloneLspManager
|
||||
from codexlens.search.association_tree import AssociationTreeBuilder, ResultDeduplicator
|
||||
|
||||
|
||||
class SearchBenchmark:
|
||||
"""Benchmark different search strategies."""
|
||||
|
||||
def __init__(self, index_path: Path, config: Config):
|
||||
"""Initialize benchmark.
|
||||
|
||||
Args:
|
||||
index_path: Path to _index.db file
|
||||
config: CodexLens config
|
||||
"""
|
||||
self.index_path = index_path
|
||||
self.config = config
|
||||
self.engine = HybridSearchEngine(config=config)
|
||||
self.lsp_manager: StandaloneLspManager | None = None
|
||||
self.tree_builder: AssociationTreeBuilder | None = None
|
||||
self.deduplicator = ResultDeduplicator(
|
||||
depth_weight=0.4,
|
||||
frequency_weight=0.3,
|
||||
kind_weight=0.3,
|
||||
max_depth_penalty=10,
|
||||
)
|
||||
|
||||
async def setup_lsp(self):
|
||||
"""Setup LSP manager for association tree search."""
|
||||
self.lsp_manager = StandaloneLspManager(
|
||||
workspace_root=str(self.index_path.parent),
|
||||
timeout=5.0,
|
||||
)
|
||||
await self.lsp_manager.start()
|
||||
self.tree_builder = AssociationTreeBuilder(
|
||||
lsp_manager=self.lsp_manager,
|
||||
timeout=5.0,
|
||||
)
|
||||
|
||||
async def cleanup_lsp(self):
|
||||
"""Cleanup LSP manager."""
|
||||
if self.lsp_manager:
|
||||
await self.lsp_manager.stop()
|
||||
|
||||
def method1_pure_fts(self, query: str, limit: int = 20) -> tuple[List[SearchResult], float]:
|
||||
"""Method 1: Pure FTS (exact + fuzzy)."""
|
||||
start = time.perf_counter()
|
||||
results = self.engine.search(
|
||||
index_path=self.index_path,
|
||||
query=query,
|
||||
limit=limit,
|
||||
enable_fuzzy=True,
|
||||
enable_vector=False,
|
||||
pure_vector=False,
|
||||
)
|
||||
elapsed = time.perf_counter() - start
|
||||
return results, elapsed
|
||||
|
||||
def method2_pure_vector(self, query: str, limit: int = 20) -> tuple[List[SearchResult], float]:
|
||||
"""Method 2: Pure Vector (semantic search only)."""
|
||||
start = time.perf_counter()
|
||||
results = self.engine.search(
|
||||
index_path=self.index_path,
|
||||
query=query,
|
||||
limit=limit,
|
||||
enable_fuzzy=False,
|
||||
enable_vector=True,
|
||||
pure_vector=True,
|
||||
)
|
||||
elapsed = time.perf_counter() - start
|
||||
return results, elapsed
|
||||
|
||||
def method3_hybrid_fusion(self, query: str, limit: int = 20) -> tuple[List[SearchResult], float]:
|
||||
"""Method 3: Hybrid Fusion (FTS + Vector with RRF)."""
|
||||
start = time.perf_counter()
|
||||
results = self.engine.search(
|
||||
index_path=self.index_path,
|
||||
query=query,
|
||||
limit=limit,
|
||||
enable_fuzzy=True,
|
||||
enable_vector=True,
|
||||
pure_vector=False,
|
||||
)
|
||||
elapsed = time.perf_counter() - start
|
||||
return results, elapsed
|
||||
|
||||
async def method4_vector_lsp_tree(
|
||||
self,
|
||||
query: str,
|
||||
limit: int = 20,
|
||||
max_depth: int = 3,
|
||||
expand_callers: bool = True,
|
||||
expand_callees: bool = True,
|
||||
) -> tuple[List[SearchResult], float, Dict[str, Any]]:
|
||||
"""Method 4: Vector + LSP Association Tree (new strategy).
|
||||
|
||||
Steps:
|
||||
1. Vector search to find seed results (top 5-10)
|
||||
2. For each seed, build LSP association tree
|
||||
3. Deduplicate and score all discovered nodes
|
||||
4. Return top N results
|
||||
|
||||
Args:
|
||||
query: Search query
|
||||
limit: Final result limit
|
||||
max_depth: Maximum depth for LSP tree expansion
|
||||
expand_callers: Whether to expand incoming calls
|
||||
expand_callees: Whether to expand outgoing calls
|
||||
|
||||
Returns:
|
||||
Tuple of (results, elapsed_time, stats)
|
||||
"""
|
||||
if not self.tree_builder:
|
||||
raise RuntimeError("LSP not initialized. Call setup_lsp() first.")
|
||||
|
||||
start = time.perf_counter()
|
||||
stats = {
|
||||
"seed_count": 0,
|
||||
"trees_built": 0,
|
||||
"total_tree_nodes": 0,
|
||||
"unique_nodes": 0,
|
||||
"dedup_time_ms": 0,
|
||||
}
|
||||
|
||||
# Step 1: Get seed results from vector search (top 10)
|
||||
seed_results = self.engine.search(
|
||||
index_path=self.index_path,
|
||||
query=query,
|
||||
limit=10,
|
||||
enable_fuzzy=False,
|
||||
enable_vector=True,
|
||||
pure_vector=True,
|
||||
)
|
||||
stats["seed_count"] = len(seed_results)
|
||||
|
||||
if not seed_results:
|
||||
return [], time.perf_counter() - start, stats
|
||||
|
||||
# Step 2: Build association trees for each seed
|
||||
all_trees = []
|
||||
for seed in seed_results:
|
||||
try:
|
||||
tree = await self.tree_builder.build_tree(
|
||||
seed_file_path=seed.path,
|
||||
seed_line=seed.start_line or 1,
|
||||
seed_character=1,
|
||||
max_depth=max_depth,
|
||||
expand_callers=expand_callers,
|
||||
expand_callees=expand_callees,
|
||||
)
|
||||
if tree.node_list:
|
||||
all_trees.append(tree)
|
||||
stats["trees_built"] += 1
|
||||
stats["total_tree_nodes"] += len(tree.node_list)
|
||||
except Exception as e:
|
||||
print(f"Error building tree for {seed.path}:{seed.start_line}: {e}")
|
||||
continue
|
||||
|
||||
if not all_trees:
|
||||
# Fallback to seed results if no trees built
|
||||
return seed_results[:limit], time.perf_counter() - start, stats
|
||||
|
||||
# Step 3: Merge and deduplicate all trees
|
||||
dedup_start = time.perf_counter()
|
||||
|
||||
# Merge all node_lists into a single CallTree
|
||||
from codexlens.search.association_tree.data_structures import CallTree
|
||||
merged_tree = CallTree()
|
||||
for tree in all_trees:
|
||||
merged_tree.node_list.extend(tree.node_list)
|
||||
|
||||
# Deduplicate
|
||||
unique_nodes = self.deduplicator.deduplicate(
|
||||
tree=merged_tree,
|
||||
max_results=limit,
|
||||
)
|
||||
stats["unique_nodes"] = len(unique_nodes)
|
||||
stats["dedup_time_ms"] = (time.perf_counter() - dedup_start) * 1000
|
||||
|
||||
# Step 4: Convert UniqueNode to SearchResult
|
||||
results = []
|
||||
for node in unique_nodes:
|
||||
# Use node.score as the search score
|
||||
result = SearchResult(
|
||||
path=node.file_path,
|
||||
score=node.score,
|
||||
start_line=node.range.start_line,
|
||||
end_line=node.range.end_line,
|
||||
symbol_name=node.name,
|
||||
symbol_kind=node.kind,
|
||||
content="", # LSP doesn't provide content
|
||||
metadata={"search_source": "lsp_tree"},
|
||||
)
|
||||
results.append(result)
|
||||
|
||||
elapsed = time.perf_counter() - start
|
||||
return results, elapsed, stats
|
||||
|
||||
def print_results(self, method_name: str, results: List[SearchResult], elapsed: float, stats: Dict[str, Any] | None = None):
|
||||
"""Print benchmark results."""
|
||||
print(f"\n{'='*80}")
|
||||
print(f"Method: {method_name}")
|
||||
print(f"{'='*80}")
|
||||
print(f"Time: {elapsed*1000:.2f}ms")
|
||||
print(f"Results: {len(results)}")
|
||||
|
||||
if stats:
|
||||
print(f"\nStats:")
|
||||
for key, value in stats.items():
|
||||
print(f" {key}: {value}")
|
||||
|
||||
print(f"\nTop 5 Results:")
|
||||
for i, result in enumerate(results[:5], 1):
|
||||
print(f"{i}. [{result.score:.4f}] {result.path}:{result.start_line}")
|
||||
if result.symbol_name:
|
||||
print(f" Name: {result.symbol_name}")
|
||||
if result.metadata.get("search_source"):
|
||||
print(f" Source: {result.metadata.get('search_source')}")
|
||||
|
||||
async def run_comparison(self, query: str, limit: int = 20):
|
||||
"""Run comparison for a single query."""
|
||||
print(f"\n{'#'*80}")
|
||||
print(f"Query: {query}")
|
||||
print(f"{'#'*80}")
|
||||
|
||||
# Method 1: Pure FTS
|
||||
results1, time1 = self.method1_pure_fts(query, limit)
|
||||
self.print_results("Method 1: Pure FTS", results1, time1)
|
||||
|
||||
# Method 2: Pure Vector
|
||||
results2, time2 = self.method2_pure_vector(query, limit)
|
||||
self.print_results("Method 2: Pure Vector", results2, time2)
|
||||
|
||||
# Method 3: Hybrid Fusion
|
||||
results3, time3 = self.method3_hybrid_fusion(query, limit)
|
||||
self.print_results("Method 3: Hybrid Fusion (FTS+Vector)", results3, time3)
|
||||
|
||||
# Method 4: Vector + LSP Tree (requires LSP setup)
|
||||
results4 = None
|
||||
time4 = 0.0
|
||||
try:
|
||||
results4, time4, stats4 = await self.method4_vector_lsp_tree(query, limit, max_depth=3)
|
||||
self.print_results("Method 4: Vector + LSP Association Tree", results4, time4, stats4)
|
||||
except Exception as e:
|
||||
print(f"\nMethod 4: Vector + LSP Association Tree")
|
||||
print(f"Error: {e}")
|
||||
|
||||
# Comparison summary
|
||||
print(f"\n{'='*80}")
|
||||
print(f"Summary")
|
||||
print(f"{'='*80}")
|
||||
print(f"Method 1 (FTS): {time1*1000:8.2f}ms {len(results1):3d} results")
|
||||
print(f"Method 2 (Vector): {time2*1000:8.2f}ms {len(results2):3d} results")
|
||||
print(f"Method 3 (Hybrid): {time3*1000:8.2f}ms {len(results3):3d} results")
|
||||
if results4 is not None:
|
||||
print(f"Method 4 (Vector+LSP): {time4*1000:8.2f}ms {len(results4):3d} results")
|
||||
|
||||
|
||||
async def main():
|
||||
"""Main benchmark entry point."""
|
||||
# Setup - use the actual index path from ~/.codexlens/indexes/
|
||||
import os
|
||||
codexlens_home = Path(os.path.expanduser("~/.codexlens"))
|
||||
index_path = codexlens_home / "indexes/D/Claude_dms3/codex-lens/src/codexlens/_index.db"
|
||||
|
||||
if not index_path.exists():
|
||||
print(f"Error: Index not found at {index_path}")
|
||||
print("Please run: python -m codexlens index init src")
|
||||
return
|
||||
|
||||
project_root = Path("D:/Claude_dms3/codex-lens/src")
|
||||
|
||||
config = Config()
|
||||
benchmark = SearchBenchmark(index_path, config)
|
||||
|
||||
# Test queries
|
||||
queries = [
|
||||
"vector search implementation",
|
||||
"LSP call hierarchy",
|
||||
"search result ranking",
|
||||
"index building",
|
||||
]
|
||||
|
||||
# Setup LSP for Method 4
|
||||
print("Setting up LSP manager...")
|
||||
try:
|
||||
await benchmark.setup_lsp()
|
||||
print("LSP manager ready")
|
||||
except Exception as e:
|
||||
print(f"Warning: Could not setup LSP: {e}")
|
||||
print("Method 4 will be skipped")
|
||||
|
||||
try:
|
||||
# Run benchmarks
|
||||
for query in queries:
|
||||
await benchmark.run_comparison(query, limit=20)
|
||||
|
||||
finally:
|
||||
# Cleanup
|
||||
await benchmark.cleanup_lsp()
|
||||
print("\nBenchmark complete")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
@@ -1,123 +0,0 @@
|
||||
"""Simple search method comparison using CLI commands.
|
||||
|
||||
Compares:
|
||||
1. FTS (Full-Text Search)
|
||||
2. Semantic (Dense + Rerank)
|
||||
3. Hybrid (Future: FTS + Semantic fusion)
|
||||
|
||||
Usage:
|
||||
python examples/simple_search_comparison.py
|
||||
"""
|
||||
|
||||
import subprocess
|
||||
import time
|
||||
import json
|
||||
import re
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
def strip_ansi(text: str) -> str:
|
||||
"""Remove ANSI color codes from text."""
|
||||
ansi_escape = re.compile(r'\x1b\[[0-9;]*m')
|
||||
return ansi_escape.sub('', text)
|
||||
|
||||
def run_search(query: str, method: str, limit: int = 20) -> tuple[list, float]:
|
||||
"""Run search via CLI and measure time."""
|
||||
cmd = [
|
||||
"python", "-m", "codexlens", "search",
|
||||
query,
|
||||
"--method", method,
|
||||
"--limit", str(limit),
|
||||
"--json",
|
||||
"-p", "."
|
||||
]
|
||||
|
||||
start = time.perf_counter()
|
||||
result = subprocess.run(
|
||||
cmd,
|
||||
cwd=str(Path("D:/Claude_dms3/codex-lens/src")),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env={**os.environ, "NO_COLOR": "1"}, # Try to disable colors
|
||||
)
|
||||
elapsed = time.perf_counter() - start
|
||||
|
||||
if result.returncode != 0:
|
||||
print(f"Error running {method} search:")
|
||||
print(result.stderr[:200])
|
||||
return [], elapsed
|
||||
|
||||
try:
|
||||
# Strip ANSI codes and parse JSON
|
||||
clean_output = strip_ansi(result.stdout)
|
||||
data = json.loads(clean_output)
|
||||
# Results are nested in "result" object
|
||||
if "result" in data and "results" in data["result"]:
|
||||
return data["result"]["results"], elapsed
|
||||
return data.get("results", []), elapsed
|
||||
except json.JSONDecodeError as e:
|
||||
print(f"Failed to parse JSON output for {method}: {e}")
|
||||
return [], elapsed
|
||||
|
||||
|
||||
def print_comparison(query: str):
|
||||
"""Print comparison for a single query."""
|
||||
print(f"\n{'='*80}")
|
||||
print(f"Query: {query}")
|
||||
print(f"{'='*80}\n")
|
||||
|
||||
# Method 1: FTS
|
||||
print("Method 1: FTS (Full-Text Search)")
|
||||
results_fts, time_fts = run_search(query, "fts", 20)
|
||||
print(f" Time: {time_fts*1000:.2f}ms")
|
||||
print(f" Results: {len(results_fts)}")
|
||||
if results_fts:
|
||||
print(f" Top 3:")
|
||||
for i, r in enumerate(results_fts[:3], 1):
|
||||
path = r.get("path", "").replace("D:\\Claude_dms3\\codex-lens\\src\\", "")
|
||||
score = r.get("score", 0)
|
||||
print(f" {i}. [{score:.4f}] {path}")
|
||||
print()
|
||||
|
||||
# Method 2: Semantic (Dense + Rerank)
|
||||
print("Method 2: Semantic (Dense + Rerank)")
|
||||
results_semantic, time_semantic = run_search(query, "dense_rerank", 20)
|
||||
print(f" Time: {time_semantic*1000:.2f}ms")
|
||||
print(f" Results: {len(results_semantic)}")
|
||||
if results_semantic:
|
||||
print(f" Top 3:")
|
||||
for i, r in enumerate(results_semantic[:3], 1):
|
||||
path = r.get("path", "").replace("D:\\Claude_dms3\\codex-lens\\src\\", "")
|
||||
score = r.get("score", 0)
|
||||
print(f" {i}. [{score:.4f}] {path}")
|
||||
print()
|
||||
|
||||
# Summary
|
||||
print(f"Summary:")
|
||||
print(f" FTS: {time_fts*1000:8.2f}ms {len(results_fts):3d} results")
|
||||
print(f" Semantic: {time_semantic*1000:8.2f}ms {len(results_semantic):3d} results")
|
||||
print(f" Speedup: {time_semantic/time_fts:6.2f}x (FTS faster)")
|
||||
|
||||
|
||||
def main():
|
||||
"""Main comparison entry point."""
|
||||
queries = [
|
||||
"vector search",
|
||||
"LSP call hierarchy",
|
||||
"search ranking",
|
||||
"index building",
|
||||
]
|
||||
|
||||
print("Search Method Comparison")
|
||||
print("=" * 80)
|
||||
|
||||
for query in queries:
|
||||
print_comparison(query)
|
||||
|
||||
print(f"\n{'='*80}")
|
||||
print("Comparison complete")
|
||||
print(f"{'='*80}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,79 +0,0 @@
|
||||
"""Test LSP server capabilities."""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
from pathlib import Path
|
||||
from codexlens.lsp.standalone_manager import StandaloneLspManager
|
||||
|
||||
async def test_capabilities():
|
||||
"""Test what capabilities Pyright provides."""
|
||||
|
||||
workspace_root = Path("D:/Claude_dms3/codex-lens/src")
|
||||
|
||||
print("Testing LSP Capabilities")
|
||||
print("="*80)
|
||||
|
||||
# Create LSP manager
|
||||
manager = StandaloneLspManager(
|
||||
workspace_root=str(workspace_root),
|
||||
timeout=10.0,
|
||||
)
|
||||
|
||||
try:
|
||||
# Start LSP manager
|
||||
print("\n1. Starting LSP manager...")
|
||||
await manager.start()
|
||||
print(" [OK] LSP manager started")
|
||||
|
||||
# Get server state for Python
|
||||
print("\n2. Getting Python server state...")
|
||||
test_file = str(workspace_root / "codexlens/search/hybrid_search.py")
|
||||
state = await manager._get_server(test_file)
|
||||
|
||||
if not state:
|
||||
print(" [ERROR] Could not get server state!")
|
||||
return
|
||||
|
||||
print(f" [OK] Server state obtained")
|
||||
print(f" Initialized: {state.initialized}")
|
||||
|
||||
# Print capabilities
|
||||
print("\n3. Server Capabilities:")
|
||||
print("-"*80)
|
||||
caps = state.capabilities
|
||||
|
||||
# Key capabilities to check
|
||||
important_caps = [
|
||||
"callHierarchyProvider",
|
||||
"definitionProvider",
|
||||
"referencesProvider",
|
||||
"documentSymbolProvider",
|
||||
"workspaceSymbolProvider",
|
||||
"hoverProvider",
|
||||
"completionProvider",
|
||||
"signatureHelpProvider",
|
||||
]
|
||||
|
||||
for cap in important_caps:
|
||||
value = caps.get(cap)
|
||||
status = "[YES]" if value else "[NO]"
|
||||
print(f" {status} {cap}: {value}")
|
||||
|
||||
# Print all capabilities as JSON for reference
|
||||
print("\n4. Full capabilities (formatted):")
|
||||
print("-"*80)
|
||||
print(json.dumps(caps, indent=2))
|
||||
|
||||
except Exception as e:
|
||||
print(f"\n[ERROR] Error: {e}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
|
||||
finally:
|
||||
# Cleanup
|
||||
print("\n5. Cleaning up...")
|
||||
await manager.stop()
|
||||
print(" [OK] LSP manager stopped")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(test_capabilities())
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user