From 113d0bd234fb29e1a60b8462fae7ab99661ae32e Mon Sep 17 00:00:00 2001 From: catlog22 Date: Wed, 18 Feb 2026 22:38:27 +0800 Subject: [PATCH] chore: remove Python build artifacts from git tracking - Remove codex-lens/build/ from version control - Update .gitignore to exclude build directories - Add ccw/.tmp-ccw-auth-home/ to gitignore (runtime temp) --- .gitignore | 7 + codex-lens/build/lib/codexlens/__init__.py | 28 - codex-lens/build/lib/codexlens/__main__.py | 14 - .../build/lib/codexlens/api/__init__.py | 88 - .../build/lib/codexlens/api/definition.py | 126 - .../build/lib/codexlens/api/file_context.py | 271 - codex-lens/build/lib/codexlens/api/hover.py | 148 - codex-lens/build/lib/codexlens/api/models.py | 281 -- .../build/lib/codexlens/api/references.py | 345 -- .../build/lib/codexlens/api/semantic.py | 471 -- codex-lens/build/lib/codexlens/api/symbols.py | 146 - codex-lens/build/lib/codexlens/api/utils.py | 153 - .../build/lib/codexlens/cli/__init__.py | 27 - .../build/lib/codexlens/cli/commands.py | 4494 ----------------- .../lib/codexlens/cli/embedding_manager.py | 2001 -------- .../build/lib/codexlens/cli/model_manager.py | 1026 ---- codex-lens/build/lib/codexlens/cli/output.py | 135 - codex-lens/build/lib/codexlens/config.py | 692 --- codex-lens/build/lib/codexlens/entities.py | 128 - codex-lens/build/lib/codexlens/env_config.py | 304 -- codex-lens/build/lib/codexlens/errors.py | 59 - .../lib/codexlens/hybrid_search/__init__.py | 28 - .../hybrid_search/data_structures.py | 602 --- .../build/lib/codexlens/indexing/__init__.py | 26 - .../build/lib/codexlens/indexing/embedding.py | 582 --- .../codexlens/indexing/symbol_extractor.py | 277 - .../build/lib/codexlens/lsp/__init__.py | 34 - .../build/lib/codexlens/lsp/handlers.py | 551 -- .../build/lib/codexlens/lsp/lsp_bridge.py | 834 --- .../lib/codexlens/lsp/lsp_graph_builder.py | 375 -- .../build/lib/codexlens/lsp/providers.py | 177 - codex-lens/build/lib/codexlens/lsp/server.py | 263 - 
.../lib/codexlens/lsp/standalone_manager.py | 1159 ----- .../build/lib/codexlens/mcp/__init__.py | 20 - codex-lens/build/lib/codexlens/mcp/hooks.py | 170 - .../build/lib/codexlens/mcp/provider.py | 202 - codex-lens/build/lib/codexlens/mcp/schema.py | 113 - .../build/lib/codexlens/parsers/__init__.py | 8 - .../build/lib/codexlens/parsers/encoding.py | 202 - .../build/lib/codexlens/parsers/factory.py | 385 -- .../build/lib/codexlens/parsers/tokenizer.py | 98 - .../codexlens/parsers/treesitter_parser.py | 809 --- .../build/lib/codexlens/search/__init__.py | 53 - .../search/association_tree/__init__.py | 21 - .../search/association_tree/builder.py | 450 -- .../association_tree/data_structures.py | 191 - .../search/association_tree/deduplicator.py | 301 -- .../lib/codexlens/search/binary_searcher.py | 277 - .../lib/codexlens/search/chain_search.py | 3268 ------------ .../codexlens/search/clustering/__init__.py | 124 - .../lib/codexlens/search/clustering/base.py | 153 - .../search/clustering/dbscan_strategy.py | 197 - .../codexlens/search/clustering/factory.py | 202 - .../search/clustering/frequency_strategy.py | 263 - .../search/clustering/hdbscan_strategy.py | 153 - .../search/clustering/noop_strategy.py | 83 - .../build/lib/codexlens/search/enrichment.py | 171 - .../lib/codexlens/search/graph_expander.py | 264 - .../lib/codexlens/search/hybrid_search.py | 1409 ------ .../lib/codexlens/search/query_parser.py | 242 - .../build/lib/codexlens/search/ranking.py | 942 ---- .../build/lib/codexlens/semantic/__init__.py | 118 - .../build/lib/codexlens/semantic/ann_index.py | 1068 ---- .../build/lib/codexlens/semantic/base.py | 61 - .../build/lib/codexlens/semantic/chunker.py | 821 --- .../lib/codexlens/semantic/code_extractor.py | 274 - .../build/lib/codexlens/semantic/embedder.py | 288 -- .../build/lib/codexlens/semantic/factory.py | 158 - .../lib/codexlens/semantic/gpu_support.py | 431 -- .../codexlens/semantic/litellm_embedder.py | 144 - 
.../codexlens/semantic/reranker/__init__.py | 25 - .../semantic/reranker/api_reranker.py | 403 -- .../lib/codexlens/semantic/reranker/base.py | 46 - .../codexlens/semantic/reranker/factory.py | 159 - .../semantic/reranker/fastembed_reranker.py | 257 - .../lib/codexlens/semantic/reranker/legacy.py | 91 - .../semantic/reranker/litellm_reranker.py | 214 - .../semantic/reranker/onnx_reranker.py | 268 - .../codexlens/semantic/rotational_embedder.py | 434 -- .../lib/codexlens/semantic/splade_encoder.py | 567 --- .../lib/codexlens/semantic/vector_store.py | 1278 ----- .../build/lib/codexlens/storage/__init__.py | 32 - .../build/lib/codexlens/storage/dir_index.py | 2358 --------- .../build/lib/codexlens/storage/file_cache.py | 32 - .../lib/codexlens/storage/global_index.py | 398 -- .../build/lib/codexlens/storage/index_tree.py | 1064 ---- .../lib/codexlens/storage/merkle_tree.py | 136 - .../codexlens/storage/migration_manager.py | 154 - .../codexlens/storage/migrations/__init__.py | 1 - .../migration_001_normalize_keywords.py | 123 - .../migration_002_add_token_metadata.py | 48 - .../migrations/migration_004_dual_fts.py | 232 - .../migration_005_cleanup_unused_fields.py | 196 - .../migration_006_enhance_relationships.py | 37 - .../migration_007_add_graph_neighbors.py | 47 - .../migration_008_add_merkle_hashes.py | 81 - .../migrations/migration_009_add_splade.py | 103 - .../migration_010_add_multi_vector_chunks.py | 162 - .../lib/codexlens/storage/path_mapper.py | 300 -- .../build/lib/codexlens/storage/registry.py | 683 --- .../lib/codexlens/storage/splade_index.py | 578 --- .../lib/codexlens/storage/sqlite_store.py | 976 ---- .../lib/codexlens/storage/sqlite_utils.py | 64 - .../codexlens/storage/vector_meta_store.py | 415 -- .../build/lib/codexlens/watcher/__init__.py | 17 - .../build/lib/codexlens/watcher/events.py | 82 - .../lib/codexlens/watcher/file_watcher.py | 347 -- .../codexlens/watcher/incremental_indexer.py | 369 -- .../build/lib/codexlens/watcher/manager.py | 
255 - 109 files changed, 7 insertions(+), 43011 deletions(-) delete mode 100644 codex-lens/build/lib/codexlens/__init__.py delete mode 100644 codex-lens/build/lib/codexlens/__main__.py delete mode 100644 codex-lens/build/lib/codexlens/api/__init__.py delete mode 100644 codex-lens/build/lib/codexlens/api/definition.py delete mode 100644 codex-lens/build/lib/codexlens/api/file_context.py delete mode 100644 codex-lens/build/lib/codexlens/api/hover.py delete mode 100644 codex-lens/build/lib/codexlens/api/models.py delete mode 100644 codex-lens/build/lib/codexlens/api/references.py delete mode 100644 codex-lens/build/lib/codexlens/api/semantic.py delete mode 100644 codex-lens/build/lib/codexlens/api/symbols.py delete mode 100644 codex-lens/build/lib/codexlens/api/utils.py delete mode 100644 codex-lens/build/lib/codexlens/cli/__init__.py delete mode 100644 codex-lens/build/lib/codexlens/cli/commands.py delete mode 100644 codex-lens/build/lib/codexlens/cli/embedding_manager.py delete mode 100644 codex-lens/build/lib/codexlens/cli/model_manager.py delete mode 100644 codex-lens/build/lib/codexlens/cli/output.py delete mode 100644 codex-lens/build/lib/codexlens/config.py delete mode 100644 codex-lens/build/lib/codexlens/entities.py delete mode 100644 codex-lens/build/lib/codexlens/env_config.py delete mode 100644 codex-lens/build/lib/codexlens/errors.py delete mode 100644 codex-lens/build/lib/codexlens/hybrid_search/__init__.py delete mode 100644 codex-lens/build/lib/codexlens/hybrid_search/data_structures.py delete mode 100644 codex-lens/build/lib/codexlens/indexing/__init__.py delete mode 100644 codex-lens/build/lib/codexlens/indexing/embedding.py delete mode 100644 codex-lens/build/lib/codexlens/indexing/symbol_extractor.py delete mode 100644 codex-lens/build/lib/codexlens/lsp/__init__.py delete mode 100644 codex-lens/build/lib/codexlens/lsp/handlers.py delete mode 100644 codex-lens/build/lib/codexlens/lsp/lsp_bridge.py delete mode 100644 
codex-lens/build/lib/codexlens/lsp/lsp_graph_builder.py delete mode 100644 codex-lens/build/lib/codexlens/lsp/providers.py delete mode 100644 codex-lens/build/lib/codexlens/lsp/server.py delete mode 100644 codex-lens/build/lib/codexlens/lsp/standalone_manager.py delete mode 100644 codex-lens/build/lib/codexlens/mcp/__init__.py delete mode 100644 codex-lens/build/lib/codexlens/mcp/hooks.py delete mode 100644 codex-lens/build/lib/codexlens/mcp/provider.py delete mode 100644 codex-lens/build/lib/codexlens/mcp/schema.py delete mode 100644 codex-lens/build/lib/codexlens/parsers/__init__.py delete mode 100644 codex-lens/build/lib/codexlens/parsers/encoding.py delete mode 100644 codex-lens/build/lib/codexlens/parsers/factory.py delete mode 100644 codex-lens/build/lib/codexlens/parsers/tokenizer.py delete mode 100644 codex-lens/build/lib/codexlens/parsers/treesitter_parser.py delete mode 100644 codex-lens/build/lib/codexlens/search/__init__.py delete mode 100644 codex-lens/build/lib/codexlens/search/association_tree/__init__.py delete mode 100644 codex-lens/build/lib/codexlens/search/association_tree/builder.py delete mode 100644 codex-lens/build/lib/codexlens/search/association_tree/data_structures.py delete mode 100644 codex-lens/build/lib/codexlens/search/association_tree/deduplicator.py delete mode 100644 codex-lens/build/lib/codexlens/search/binary_searcher.py delete mode 100644 codex-lens/build/lib/codexlens/search/chain_search.py delete mode 100644 codex-lens/build/lib/codexlens/search/clustering/__init__.py delete mode 100644 codex-lens/build/lib/codexlens/search/clustering/base.py delete mode 100644 codex-lens/build/lib/codexlens/search/clustering/dbscan_strategy.py delete mode 100644 codex-lens/build/lib/codexlens/search/clustering/factory.py delete mode 100644 codex-lens/build/lib/codexlens/search/clustering/frequency_strategy.py delete mode 100644 codex-lens/build/lib/codexlens/search/clustering/hdbscan_strategy.py delete mode 100644 
codex-lens/build/lib/codexlens/search/clustering/noop_strategy.py delete mode 100644 codex-lens/build/lib/codexlens/search/enrichment.py delete mode 100644 codex-lens/build/lib/codexlens/search/graph_expander.py delete mode 100644 codex-lens/build/lib/codexlens/search/hybrid_search.py delete mode 100644 codex-lens/build/lib/codexlens/search/query_parser.py delete mode 100644 codex-lens/build/lib/codexlens/search/ranking.py delete mode 100644 codex-lens/build/lib/codexlens/semantic/__init__.py delete mode 100644 codex-lens/build/lib/codexlens/semantic/ann_index.py delete mode 100644 codex-lens/build/lib/codexlens/semantic/base.py delete mode 100644 codex-lens/build/lib/codexlens/semantic/chunker.py delete mode 100644 codex-lens/build/lib/codexlens/semantic/code_extractor.py delete mode 100644 codex-lens/build/lib/codexlens/semantic/embedder.py delete mode 100644 codex-lens/build/lib/codexlens/semantic/factory.py delete mode 100644 codex-lens/build/lib/codexlens/semantic/gpu_support.py delete mode 100644 codex-lens/build/lib/codexlens/semantic/litellm_embedder.py delete mode 100644 codex-lens/build/lib/codexlens/semantic/reranker/__init__.py delete mode 100644 codex-lens/build/lib/codexlens/semantic/reranker/api_reranker.py delete mode 100644 codex-lens/build/lib/codexlens/semantic/reranker/base.py delete mode 100644 codex-lens/build/lib/codexlens/semantic/reranker/factory.py delete mode 100644 codex-lens/build/lib/codexlens/semantic/reranker/fastembed_reranker.py delete mode 100644 codex-lens/build/lib/codexlens/semantic/reranker/legacy.py delete mode 100644 codex-lens/build/lib/codexlens/semantic/reranker/litellm_reranker.py delete mode 100644 codex-lens/build/lib/codexlens/semantic/reranker/onnx_reranker.py delete mode 100644 codex-lens/build/lib/codexlens/semantic/rotational_embedder.py delete mode 100644 codex-lens/build/lib/codexlens/semantic/splade_encoder.py delete mode 100644 codex-lens/build/lib/codexlens/semantic/vector_store.py delete mode 100644 
codex-lens/build/lib/codexlens/storage/__init__.py delete mode 100644 codex-lens/build/lib/codexlens/storage/dir_index.py delete mode 100644 codex-lens/build/lib/codexlens/storage/file_cache.py delete mode 100644 codex-lens/build/lib/codexlens/storage/global_index.py delete mode 100644 codex-lens/build/lib/codexlens/storage/index_tree.py delete mode 100644 codex-lens/build/lib/codexlens/storage/merkle_tree.py delete mode 100644 codex-lens/build/lib/codexlens/storage/migration_manager.py delete mode 100644 codex-lens/build/lib/codexlens/storage/migrations/__init__.py delete mode 100644 codex-lens/build/lib/codexlens/storage/migrations/migration_001_normalize_keywords.py delete mode 100644 codex-lens/build/lib/codexlens/storage/migrations/migration_002_add_token_metadata.py delete mode 100644 codex-lens/build/lib/codexlens/storage/migrations/migration_004_dual_fts.py delete mode 100644 codex-lens/build/lib/codexlens/storage/migrations/migration_005_cleanup_unused_fields.py delete mode 100644 codex-lens/build/lib/codexlens/storage/migrations/migration_006_enhance_relationships.py delete mode 100644 codex-lens/build/lib/codexlens/storage/migrations/migration_007_add_graph_neighbors.py delete mode 100644 codex-lens/build/lib/codexlens/storage/migrations/migration_008_add_merkle_hashes.py delete mode 100644 codex-lens/build/lib/codexlens/storage/migrations/migration_009_add_splade.py delete mode 100644 codex-lens/build/lib/codexlens/storage/migrations/migration_010_add_multi_vector_chunks.py delete mode 100644 codex-lens/build/lib/codexlens/storage/path_mapper.py delete mode 100644 codex-lens/build/lib/codexlens/storage/registry.py delete mode 100644 codex-lens/build/lib/codexlens/storage/splade_index.py delete mode 100644 codex-lens/build/lib/codexlens/storage/sqlite_store.py delete mode 100644 codex-lens/build/lib/codexlens/storage/sqlite_utils.py delete mode 100644 codex-lens/build/lib/codexlens/storage/vector_meta_store.py delete mode 100644 
codex-lens/build/lib/codexlens/watcher/__init__.py delete mode 100644 codex-lens/build/lib/codexlens/watcher/events.py delete mode 100644 codex-lens/build/lib/codexlens/watcher/file_watcher.py delete mode 100644 codex-lens/build/lib/codexlens/watcher/incremental_indexer.py delete mode 100644 codex-lens/build/lib/codexlens/watcher/manager.py diff --git a/.gitignore b/.gitignore index 9e14f8e5..b20002d0 100644 --- a/.gitignore +++ b/.gitignore @@ -26,6 +26,13 @@ COMMAND_TEMPLATE_EXECUTOR.md COMMAND_TEMPLATE_ORCHESTRATOR.md *.pyc .codexlens/ + +# Python build artifacts +codex-lens/build/ +ccw-litellm/build/ + +# CCW runtime temp directories +ccw/.tmp-ccw-auth-home/ /settings.json *.mcp.json .mcp.json diff --git a/codex-lens/build/lib/codexlens/__init__.py b/codex-lens/build/lib/codexlens/__init__.py deleted file mode 100644 index 56f2e508..00000000 --- a/codex-lens/build/lib/codexlens/__init__.py +++ /dev/null @@ -1,28 +0,0 @@ -"""CodexLens package.""" - -from __future__ import annotations - -from . 
import config, entities, errors -from .config import Config -from .entities import IndexedFile, SearchResult, SemanticChunk, Symbol -from .errors import CodexLensError, ConfigError, ParseError, SearchError, StorageError - -__version__ = "0.1.0" - -__all__ = [ - "__version__", - "config", - "entities", - "errors", - "Config", - "IndexedFile", - "SearchResult", - "SemanticChunk", - "Symbol", - "CodexLensError", - "ConfigError", - "ParseError", - "StorageError", - "SearchError", -] - diff --git a/codex-lens/build/lib/codexlens/__main__.py b/codex-lens/build/lib/codexlens/__main__.py deleted file mode 100644 index 35190f97..00000000 --- a/codex-lens/build/lib/codexlens/__main__.py +++ /dev/null @@ -1,14 +0,0 @@ -"""Module entrypoint for `python -m codexlens`.""" - -from __future__ import annotations - -from codexlens.cli import app - - -def main() -> None: - app() - - -if __name__ == "__main__": - main() - diff --git a/codex-lens/build/lib/codexlens/api/__init__.py b/codex-lens/build/lib/codexlens/api/__init__.py deleted file mode 100644 index 6312ece5..00000000 --- a/codex-lens/build/lib/codexlens/api/__init__.py +++ /dev/null @@ -1,88 +0,0 @@ -"""Codexlens Public API Layer. - -This module exports all public API functions and dataclasses for the -codexlens LSP-like functionality. 
- -Dataclasses (from models.py): - - CallInfo: Call relationship information - - MethodContext: Method context with call relationships - - FileContextResult: File context result with method summaries - - DefinitionResult: Definition lookup result - - ReferenceResult: Reference lookup result - - GroupedReferences: References grouped by definition - - SymbolInfo: Symbol information for workspace search - - HoverInfo: Hover information for a symbol - - SemanticResult: Semantic search result - -Utility functions (from utils.py): - - resolve_project: Resolve and validate project root path - - normalize_relationship_type: Normalize relationship type to canonical form - - rank_by_proximity: Rank results by file path proximity - -Example: - >>> from codexlens.api import ( - ... DefinitionResult, - ... resolve_project, - ... normalize_relationship_type - ... ) - >>> project = resolve_project("/path/to/project") - >>> rel_type = normalize_relationship_type("calls") - >>> print(rel_type) - 'call' -""" - -from __future__ import annotations - -# Dataclasses -from .models import ( - CallInfo, - MethodContext, - FileContextResult, - DefinitionResult, - ReferenceResult, - GroupedReferences, - SymbolInfo, - HoverInfo, - SemanticResult, -) - -# Utility functions -from .utils import ( - resolve_project, - normalize_relationship_type, - rank_by_proximity, - rank_by_score, -) - -# API functions -from .definition import find_definition -from .symbols import workspace_symbols -from .hover import get_hover -from .file_context import file_context -from .references import find_references -from .semantic import semantic_search - -__all__ = [ - # Dataclasses - "CallInfo", - "MethodContext", - "FileContextResult", - "DefinitionResult", - "ReferenceResult", - "GroupedReferences", - "SymbolInfo", - "HoverInfo", - "SemanticResult", - # Utility functions - "resolve_project", - "normalize_relationship_type", - "rank_by_proximity", - "rank_by_score", - # API functions - "find_definition", - 
"workspace_symbols", - "get_hover", - "file_context", - "find_references", - "semantic_search", -] diff --git a/codex-lens/build/lib/codexlens/api/definition.py b/codex-lens/build/lib/codexlens/api/definition.py deleted file mode 100644 index ecfe874b..00000000 --- a/codex-lens/build/lib/codexlens/api/definition.py +++ /dev/null @@ -1,126 +0,0 @@ -"""find_definition API implementation. - -This module provides the find_definition() function for looking up -symbol definitions with a 3-stage fallback strategy. -""" - -from __future__ import annotations - -import logging -from pathlib import Path -from typing import List, Optional - -from ..entities import Symbol -from ..storage.global_index import GlobalSymbolIndex -from ..storage.registry import RegistryStore -from ..errors import IndexNotFoundError -from .models import DefinitionResult -from .utils import resolve_project, rank_by_proximity - -logger = logging.getLogger(__name__) - - -def find_definition( - project_root: str, - symbol_name: str, - symbol_kind: Optional[str] = None, - file_context: Optional[str] = None, - limit: int = 10 -) -> List[DefinitionResult]: - """Find definition locations for a symbol. - - Uses a 3-stage fallback strategy: - 1. Exact match with kind filter - 2. Exact match without kind filter - 3. Prefix match - - Args: - project_root: Project root directory (for index location) - symbol_name: Name of the symbol to find - symbol_kind: Optional symbol kind filter (class, function, etc.) 
- file_context: Optional file path for proximity ranking - limit: Maximum number of results to return - - Returns: - List of DefinitionResult sorted by proximity if file_context provided - - Raises: - IndexNotFoundError: If project is not indexed - """ - project_path = resolve_project(project_root) - - # Get project info from registry - registry = RegistryStore() - project_info = registry.get_project(project_path) - if project_info is None: - raise IndexNotFoundError(f"Project not indexed: {project_path}") - - # Open global symbol index - index_db = project_info.index_root / "_global_symbols.db" - if not index_db.exists(): - raise IndexNotFoundError(f"Global symbol index not found: {index_db}") - - global_index = GlobalSymbolIndex(str(index_db), project_info.id) - - # Stage 1: Exact match with kind filter - results = _search_with_kind(global_index, symbol_name, symbol_kind, limit) - if results: - logger.debug(f"Stage 1 (exact+kind): Found {len(results)} results for {symbol_name}") - return _rank_and_convert(results, file_context) - - # Stage 2: Exact match without kind (if kind was specified) - if symbol_kind: - results = _search_with_kind(global_index, symbol_name, None, limit) - if results: - logger.debug(f"Stage 2 (exact): Found {len(results)} results for {symbol_name}") - return _rank_and_convert(results, file_context) - - # Stage 3: Prefix match - results = global_index.search( - name=symbol_name, - kind=None, - limit=limit, - prefix_mode=True - ) - if results: - logger.debug(f"Stage 3 (prefix): Found {len(results)} results for {symbol_name}") - return _rank_and_convert(results, file_context) - - logger.debug(f"No definitions found for {symbol_name}") - return [] - - -def _search_with_kind( - global_index: GlobalSymbolIndex, - symbol_name: str, - symbol_kind: Optional[str], - limit: int -) -> List[Symbol]: - """Search for symbols with optional kind filter.""" - return global_index.search( - name=symbol_name, - kind=symbol_kind, - limit=limit, - 
prefix_mode=False - ) - - -def _rank_and_convert( - symbols: List[Symbol], - file_context: Optional[str] -) -> List[DefinitionResult]: - """Convert symbols to DefinitionResult and rank by proximity.""" - results = [ - DefinitionResult( - name=sym.name, - kind=sym.kind, - file_path=sym.file or "", - line=sym.range[0] if sym.range else 1, - end_line=sym.range[1] if sym.range else 1, - signature=None, # Could extract from file if needed - container=None, # Could extract from parent symbol - score=1.0 - ) - for sym in symbols - ] - return rank_by_proximity(results, file_context) diff --git a/codex-lens/build/lib/codexlens/api/file_context.py b/codex-lens/build/lib/codexlens/api/file_context.py deleted file mode 100644 index 6e1f9408..00000000 --- a/codex-lens/build/lib/codexlens/api/file_context.py +++ /dev/null @@ -1,271 +0,0 @@ -"""file_context API implementation. - -This module provides the file_context() function for retrieving -method call graphs from a source file. -""" - -from __future__ import annotations - -import logging -import os -from pathlib import Path -from typing import List, Optional, Tuple - -from ..entities import Symbol -from ..storage.global_index import GlobalSymbolIndex -from ..storage.dir_index import DirIndexStore -from ..storage.registry import RegistryStore -from ..errors import IndexNotFoundError -from .models import ( - FileContextResult, - MethodContext, - CallInfo, -) -from .utils import resolve_project, normalize_relationship_type - -logger = logging.getLogger(__name__) - - -def file_context( - project_root: str, - file_path: str, - include_calls: bool = True, - include_callers: bool = True, - max_depth: int = 1, - format: str = "brief" -) -> FileContextResult: - """Get method call context for a code file. - - Retrieves all methods/functions in the file along with their - outgoing calls and incoming callers. 
- - Args: - project_root: Project root directory (for index location) - file_path: Path to the code file to analyze - include_calls: Whether to include outgoing calls - include_callers: Whether to include incoming callers - max_depth: Call chain depth (V1 only supports 1) - format: Output format (brief | detailed | tree) - - Returns: - FileContextResult with method contexts and summary - - Raises: - IndexNotFoundError: If project is not indexed - FileNotFoundError: If file does not exist - ValueError: If max_depth > 1 (V1 limitation) - """ - # V1 limitation: only depth=1 supported - if max_depth > 1: - raise ValueError( - f"max_depth > 1 not supported in V1. " - f"Requested: {max_depth}, supported: 1" - ) - - project_path = resolve_project(project_root) - file_path_resolved = Path(file_path).resolve() - - # Validate file exists - if not file_path_resolved.exists(): - raise FileNotFoundError(f"File not found: {file_path_resolved}") - - # Get project info from registry - registry = RegistryStore() - project_info = registry.get_project(project_path) - if project_info is None: - raise IndexNotFoundError(f"Project not indexed: {project_path}") - - # Open global symbol index - index_db = project_info.index_root / "_global_symbols.db" - if not index_db.exists(): - raise IndexNotFoundError(f"Global symbol index not found: {index_db}") - - global_index = GlobalSymbolIndex(str(index_db), project_info.id) - - # Get all symbols in the file - symbols = global_index.get_file_symbols(str(file_path_resolved)) - - # Filter to functions, methods, and classes - method_symbols = [ - s for s in symbols - if s.kind in ("function", "method", "class") - ] - - logger.debug(f"Found {len(method_symbols)} methods in {file_path}") - - # Try to find dir_index for relationship queries - dir_index = _find_dir_index(project_info, file_path_resolved) - - # Build method contexts - methods: List[MethodContext] = [] - outgoing_resolved = True - incoming_resolved = True - targets_resolved = True - - 
for symbol in method_symbols: - calls: List[CallInfo] = [] - callers: List[CallInfo] = [] - - if include_calls and dir_index: - try: - outgoing = dir_index.get_outgoing_calls( - str(file_path_resolved), - symbol.name - ) - for target_name, rel_type, line, target_file in outgoing: - calls.append(CallInfo( - symbol_name=target_name, - file_path=target_file, - line=line, - relationship=normalize_relationship_type(rel_type) - )) - if target_file is None: - targets_resolved = False - except Exception as e: - logger.debug(f"Failed to get outgoing calls: {e}") - outgoing_resolved = False - - if include_callers and dir_index: - try: - incoming = dir_index.get_incoming_calls(symbol.name) - for source_name, rel_type, line, source_file in incoming: - callers.append(CallInfo( - symbol_name=source_name, - file_path=source_file, - line=line, - relationship=normalize_relationship_type(rel_type) - )) - except Exception as e: - logger.debug(f"Failed to get incoming calls: {e}") - incoming_resolved = False - - methods.append(MethodContext( - name=symbol.name, - kind=symbol.kind, - line_range=symbol.range if symbol.range else (1, 1), - signature=None, # Could extract from source - calls=calls, - callers=callers - )) - - # Detect language from file extension - language = _detect_language(file_path_resolved) - - # Generate summary - summary = _generate_summary(file_path_resolved, methods, format) - - return FileContextResult( - file_path=str(file_path_resolved), - language=language, - methods=methods, - summary=summary, - discovery_status={ - "outgoing_resolved": outgoing_resolved, - "incoming_resolved": incoming_resolved, - "targets_resolved": targets_resolved - } - ) - - -def _find_dir_index(project_info, file_path: Path) -> Optional[DirIndexStore]: - """Find the dir_index that contains the file. 
- - Args: - project_info: Project information from registry - file_path: Path to the file - - Returns: - DirIndexStore if found, None otherwise - """ - try: - # Look for _index.db in file's directory or parent directories - current = file_path.parent - while current != current.parent: - index_db = current / "_index.db" - if index_db.exists(): - return DirIndexStore(str(index_db)) - - # Also check in project's index_root - relative = current.relative_to(project_info.source_root) - index_in_cache = project_info.index_root / relative / "_index.db" - if index_in_cache.exists(): - return DirIndexStore(str(index_in_cache)) - - current = current.parent - except Exception as e: - logger.debug(f"Failed to find dir_index: {e}") - - return None - - -def _detect_language(file_path: Path) -> str: - """Detect programming language from file extension. - - Args: - file_path: Path to the file - - Returns: - Language name - """ - ext_map = { - ".py": "python", - ".js": "javascript", - ".ts": "typescript", - ".jsx": "javascript", - ".tsx": "typescript", - ".go": "go", - ".rs": "rust", - ".java": "java", - ".c": "c", - ".cpp": "cpp", - ".h": "c", - ".hpp": "cpp", - } - return ext_map.get(file_path.suffix.lower(), "unknown") - - -def _generate_summary( - file_path: Path, - methods: List[MethodContext], - format: str -) -> str: - """Generate human-readable summary of file context. 
- - Args: - file_path: Path to the file - methods: List of method contexts - format: Output format (brief | detailed | tree) - - Returns: - Markdown-formatted summary - """ - lines = [f"## {file_path.name} ({len(methods)} methods)\n"] - - for method in methods: - start, end = method.line_range - lines.append(f"### {method.name} (line {start}-{end})") - - if method.calls: - calls_str = ", ".join( - f"{c.symbol_name} ({c.file_path or 'unresolved'}:{c.line})" - if format == "detailed" - else c.symbol_name - for c in method.calls - ) - lines.append(f"- Calls: {calls_str}") - - if method.callers: - callers_str = ", ".join( - f"{c.symbol_name} ({c.file_path}:{c.line})" - if format == "detailed" - else c.symbol_name - for c in method.callers - ) - lines.append(f"- Called by: {callers_str}") - - if not method.calls and not method.callers: - lines.append("- (no call relationships)") - - lines.append("") - - return "\n".join(lines) diff --git a/codex-lens/build/lib/codexlens/api/hover.py b/codex-lens/build/lib/codexlens/api/hover.py deleted file mode 100644 index 7860c98f..00000000 --- a/codex-lens/build/lib/codexlens/api/hover.py +++ /dev/null @@ -1,148 +0,0 @@ -"""get_hover API implementation. - -This module provides the get_hover() function for retrieving -detailed hover information for symbols. -""" - -from __future__ import annotations - -import logging -from pathlib import Path -from typing import Optional - -from ..entities import Symbol -from ..storage.global_index import GlobalSymbolIndex -from ..storage.registry import RegistryStore -from ..errors import IndexNotFoundError -from .models import HoverInfo -from .utils import resolve_project - -logger = logging.getLogger(__name__) - - -def get_hover( - project_root: str, - symbol_name: str, - file_path: Optional[str] = None -) -> Optional[HoverInfo]: - """Get detailed hover information for a symbol. 
- - Args: - project_root: Project root directory (for index location) - symbol_name: Name of the symbol to look up - file_path: Optional file path to disambiguate when symbol - appears in multiple files - - Returns: - HoverInfo if symbol found, None otherwise - - Raises: - IndexNotFoundError: If project is not indexed - """ - project_path = resolve_project(project_root) - - # Get project info from registry - registry = RegistryStore() - project_info = registry.get_project(project_path) - if project_info is None: - raise IndexNotFoundError(f"Project not indexed: {project_path}") - - # Open global symbol index - index_db = project_info.index_root / "_global_symbols.db" - if not index_db.exists(): - raise IndexNotFoundError(f"Global symbol index not found: {index_db}") - - global_index = GlobalSymbolIndex(str(index_db), project_info.id) - - # Search for the symbol - results = global_index.search( - name=symbol_name, - kind=None, - limit=50, - prefix_mode=False - ) - - if not results: - logger.debug(f"No hover info found for {symbol_name}") - return None - - # If file_path provided, filter to that file - if file_path: - file_path_resolved = str(Path(file_path).resolve()) - matching = [s for s in results if s.file == file_path_resolved] - if matching: - results = matching - - # Take the first result - symbol = results[0] - - # Build hover info - return HoverInfo( - name=symbol.name, - kind=symbol.kind, - signature=_extract_signature(symbol), - documentation=_extract_documentation(symbol), - file_path=symbol.file or "", - line_range=symbol.range if symbol.range else (1, 1), - type_info=_extract_type_info(symbol) - ) - - -def _extract_signature(symbol: Symbol) -> str: - """Extract signature from symbol. - - For now, generates a basic signature based on kind and name. - In a full implementation, this would parse the actual source code. 
- - Args: - symbol: The symbol to extract signature from - - Returns: - Signature string - """ - if symbol.kind == "function": - return f"def {symbol.name}(...)" - elif symbol.kind == "method": - return f"def {symbol.name}(self, ...)" - elif symbol.kind == "class": - return f"class {symbol.name}" - elif symbol.kind == "variable": - return symbol.name - elif symbol.kind == "constant": - return f"{symbol.name} = ..." - else: - return f"{symbol.kind} {symbol.name}" - - -def _extract_documentation(symbol: Symbol) -> Optional[str]: - """Extract documentation from symbol. - - In a full implementation, this would parse docstrings from source. - For now, returns None. - - Args: - symbol: The symbol to extract documentation from - - Returns: - Documentation string if available, None otherwise - """ - # Would need to read source file and parse docstring - # For V1, return None - return None - - -def _extract_type_info(symbol: Symbol) -> Optional[str]: - """Extract type information from symbol. - - In a full implementation, this would parse type annotations. - For now, returns None. - - Args: - symbol: The symbol to extract type info from - - Returns: - Type info string if available, None otherwise - """ - # Would need to parse type annotations from source - # For V1, return None - return None diff --git a/codex-lens/build/lib/codexlens/api/models.py b/codex-lens/build/lib/codexlens/api/models.py deleted file mode 100644 index 6c53f690..00000000 --- a/codex-lens/build/lib/codexlens/api/models.py +++ /dev/null @@ -1,281 +0,0 @@ -"""API dataclass definitions for codexlens LSP API. - -This module defines all result dataclasses used by the public API layer, -following the patterns established in mcp/schema.py. 
-""" - -from __future__ import annotations - -from dataclasses import dataclass, field, asdict -from typing import List, Optional, Dict, Tuple - - -# ============================================================================= -# Section 4.2: file_context dataclasses -# ============================================================================= - -@dataclass -class CallInfo: - """Call relationship information. - - Attributes: - symbol_name: Name of the called/calling symbol - file_path: Target file path (may be None if unresolved) - line: Line number of the call - relationship: Type of relationship (call | import | inheritance) - """ - symbol_name: str - file_path: Optional[str] - line: int - relationship: str # call | import | inheritance - - def to_dict(self) -> dict: - """Convert to dictionary, filtering None values.""" - return {k: v for k, v in asdict(self).items() if v is not None} - - -@dataclass -class MethodContext: - """Method context with call relationships. - - Attributes: - name: Method/function name - kind: Symbol kind (function | method | class) - line_range: Start and end line numbers - signature: Function signature (if available) - calls: List of outgoing calls - callers: List of incoming calls - """ - name: str - kind: str # function | method | class - line_range: Tuple[int, int] - signature: Optional[str] - calls: List[CallInfo] = field(default_factory=list) - callers: List[CallInfo] = field(default_factory=list) - - def to_dict(self) -> dict: - """Convert to dictionary, filtering None values.""" - result = { - "name": self.name, - "kind": self.kind, - "line_range": list(self.line_range), - "calls": [c.to_dict() for c in self.calls], - "callers": [c.to_dict() for c in self.callers], - } - if self.signature is not None: - result["signature"] = self.signature - return result - - -@dataclass -class FileContextResult: - """File context result with method summaries. 
- - Attributes: - file_path: Path to the analyzed file - language: Programming language - methods: List of method contexts - summary: Human-readable summary - discovery_status: Status flags for call resolution - """ - file_path: str - language: str - methods: List[MethodContext] - summary: str - discovery_status: Dict[str, bool] = field(default_factory=lambda: { - "outgoing_resolved": False, - "incoming_resolved": True, - "targets_resolved": False - }) - - def to_dict(self) -> dict: - """Convert to dictionary for JSON serialization.""" - return { - "file_path": self.file_path, - "language": self.language, - "methods": [m.to_dict() for m in self.methods], - "summary": self.summary, - "discovery_status": self.discovery_status, - } - - -# ============================================================================= -# Section 4.3: find_definition dataclasses -# ============================================================================= - -@dataclass -class DefinitionResult: - """Definition lookup result. - - Attributes: - name: Symbol name - kind: Symbol kind (class, function, method, etc.) - file_path: File where symbol is defined - line: Start line number - end_line: End line number - signature: Symbol signature (if available) - container: Containing class/module (if any) - score: Match score for ranking - """ - name: str - kind: str - file_path: str - line: int - end_line: int - signature: Optional[str] = None - container: Optional[str] = None - score: float = 1.0 - - def to_dict(self) -> dict: - """Convert to dictionary, filtering None values.""" - return {k: v for k, v in asdict(self).items() if v is not None} - - -# ============================================================================= -# Section 4.4: find_references dataclasses -# ============================================================================= - -@dataclass -class ReferenceResult: - """Reference lookup result. 
- - Attributes: - file_path: File containing the reference - line: Line number - column: Column number - context_line: The line of code containing the reference - relationship: Type of reference (call | import | type_annotation | inheritance) - """ - file_path: str - line: int - column: int - context_line: str - relationship: str # call | import | type_annotation | inheritance - - def to_dict(self) -> dict: - """Convert to dictionary.""" - return asdict(self) - - -@dataclass -class GroupedReferences: - """References grouped by definition. - - Used when a symbol has multiple definitions (e.g., overloads). - - Attributes: - definition: The definition this group refers to - references: List of references to this definition - """ - definition: DefinitionResult - references: List[ReferenceResult] = field(default_factory=list) - - def to_dict(self) -> dict: - """Convert to dictionary.""" - return { - "definition": self.definition.to_dict(), - "references": [r.to_dict() for r in self.references], - } - - -# ============================================================================= -# Section 4.5: workspace_symbols dataclasses -# ============================================================================= - -@dataclass -class SymbolInfo: - """Symbol information for workspace search. 
- - Attributes: - name: Symbol name - kind: Symbol kind - file_path: File where symbol is defined - line: Line number - container: Containing class/module (if any) - score: Match score for ranking - """ - name: str - kind: str - file_path: str - line: int - container: Optional[str] = None - score: float = 1.0 - - def to_dict(self) -> dict: - """Convert to dictionary, filtering None values.""" - return {k: v for k, v in asdict(self).items() if v is not None} - - -# ============================================================================= -# Section 4.6: get_hover dataclasses -# ============================================================================= - -@dataclass -class HoverInfo: - """Hover information for a symbol. - - Attributes: - name: Symbol name - kind: Symbol kind - signature: Symbol signature - documentation: Documentation string (if available) - file_path: File where symbol is defined - line_range: Start and end line numbers - type_info: Type information (if available) - """ - name: str - kind: str - signature: str - documentation: Optional[str] - file_path: str - line_range: Tuple[int, int] - type_info: Optional[str] = None - - def to_dict(self) -> dict: - """Convert to dictionary, filtering None values.""" - result = { - "name": self.name, - "kind": self.kind, - "signature": self.signature, - "file_path": self.file_path, - "line_range": list(self.line_range), - } - if self.documentation is not None: - result["documentation"] = self.documentation - if self.type_info is not None: - result["type_info"] = self.type_info - return result - - -# ============================================================================= -# Section 4.7: semantic_search dataclasses -# ============================================================================= - -@dataclass -class SemanticResult: - """Semantic search result. 
- - Attributes: - symbol_name: Name of the matched symbol - kind: Symbol kind - file_path: File where symbol is defined - line: Line number - vector_score: Vector similarity score (None if not available) - structural_score: Structural match score (None if not available) - fusion_score: Combined fusion score - snippet: Code snippet - match_reason: Explanation of why this matched (optional) - """ - symbol_name: str - kind: str - file_path: str - line: int - vector_score: Optional[float] - structural_score: Optional[float] - fusion_score: float - snippet: str - match_reason: Optional[str] = None - - def to_dict(self) -> dict: - """Convert to dictionary, filtering None values.""" - return {k: v for k, v in asdict(self).items() if v is not None} diff --git a/codex-lens/build/lib/codexlens/api/references.py b/codex-lens/build/lib/codexlens/api/references.py deleted file mode 100644 index 2e3f5f1e..00000000 --- a/codex-lens/build/lib/codexlens/api/references.py +++ /dev/null @@ -1,345 +0,0 @@ -"""Find references API for codexlens. - -This module implements the find_references() function that wraps -ChainSearchEngine.search_references() with grouped result structure -for multi-definition symbols. -""" - -from __future__ import annotations - -import logging -from pathlib import Path -from typing import List, Optional, Dict - -from .models import ( - DefinitionResult, - ReferenceResult, - GroupedReferences, -) -from .utils import ( - resolve_project, - normalize_relationship_type, -) - - -logger = logging.getLogger(__name__) - - -def _read_line_from_file(file_path: str, line: int) -> str: - """Read a specific line from a file. - - Args: - file_path: Path to the file - line: Line number (1-based) - - Returns: - The line content, stripped of trailing whitespace. - Returns empty string if file cannot be read or line doesn't exist. 
- """ - try: - path = Path(file_path) - if not path.exists(): - return "" - - with path.open("r", encoding="utf-8", errors="replace") as f: - for i, content in enumerate(f, 1): - if i == line: - return content.rstrip() - return "" - except Exception as exc: - logger.debug("Failed to read line %d from %s: %s", line, file_path, exc) - return "" - - -def _transform_to_reference_result( - raw_ref: "RawReferenceResult", -) -> ReferenceResult: - """Transform raw ChainSearchEngine reference to API ReferenceResult. - - Args: - raw_ref: Raw reference result from ChainSearchEngine - - Returns: - API ReferenceResult with context_line and normalized relationship - """ - # Read the actual line from the file - context_line = _read_line_from_file(raw_ref.file_path, raw_ref.line) - - # Normalize relationship type - relationship = normalize_relationship_type(raw_ref.relationship_type) - - return ReferenceResult( - file_path=raw_ref.file_path, - line=raw_ref.line, - column=raw_ref.column, - context_line=context_line, - relationship=relationship, - ) - - -def find_references( - project_root: str, - symbol_name: str, - symbol_kind: Optional[str] = None, - include_definition: bool = True, - group_by_definition: bool = True, - limit: int = 100, -) -> List[GroupedReferences]: - """Find all reference locations for a symbol. - - Multi-definition case returns grouped results to resolve ambiguity. - - This function wraps ChainSearchEngine.search_references() and groups - the results by definition location. Each GroupedReferences contains - a definition and all references that point to it. - - Args: - project_root: Project root directory path - symbol_name: Name of the symbol to find references for - symbol_kind: Optional symbol kind filter (e.g., 'function', 'class') - include_definition: Whether to include the definition location - in the result (default True) - group_by_definition: Whether to group references by definition. - If False, returns a single group with all references. 
- (default True) - limit: Maximum number of references to return (default 100) - - Returns: - List of GroupedReferences. Each group contains: - - definition: The DefinitionResult for this symbol definition - - references: List of ReferenceResult pointing to this definition - - Raises: - ValueError: If project_root does not exist or is not a directory - - Examples: - >>> refs = find_references("/path/to/project", "authenticate") - >>> for group in refs: - ... print(f"Definition: {group.definition.file_path}:{group.definition.line}") - ... for ref in group.references: - ... print(f" Reference: {ref.file_path}:{ref.line} ({ref.relationship})") - - Note: - Reference relationship types are normalized: - - 'calls' -> 'call' - - 'imports' -> 'import' - - 'inherits' -> 'inheritance' - """ - # Validate and resolve project root - project_path = resolve_project(project_root) - - # Import here to avoid circular imports - from codexlens.config import Config - from codexlens.storage.registry import RegistryStore - from codexlens.storage.path_mapper import PathMapper - from codexlens.storage.global_index import GlobalSymbolIndex - from codexlens.search.chain_search import ChainSearchEngine - from codexlens.search.chain_search import ReferenceResult as RawReferenceResult - from codexlens.entities import Symbol - - # Initialize infrastructure - config = Config() - registry = RegistryStore() - mapper = PathMapper(config.index_dir) - - # Create chain search engine - engine = ChainSearchEngine(registry, mapper, config=config) - - try: - # Step 1: Find definitions for the symbol - definitions: List[DefinitionResult] = [] - - if include_definition or group_by_definition: - # Search for symbol definitions - symbols = engine.search_symbols( - name=symbol_name, - source_path=project_path, - kind=symbol_kind, - ) - - # Convert Symbol to DefinitionResult - for sym in symbols: - # Only include exact name matches for definitions - if sym.name != symbol_name: - continue - - # Optionally filter 
by kind - if symbol_kind and sym.kind != symbol_kind: - continue - - definitions.append(DefinitionResult( - name=sym.name, - kind=sym.kind, - file_path=sym.file or "", - line=sym.range[0] if sym.range else 1, - end_line=sym.range[1] if sym.range else 1, - signature=None, # Not available from Symbol - container=None, # Not available from Symbol - score=1.0, - )) - - # Step 2: Get all references using ChainSearchEngine - raw_references = engine.search_references( - symbol_name=symbol_name, - source_path=project_path, - depth=-1, - limit=limit, - ) - - # Step 3: Transform raw references to API ReferenceResult - api_references: List[ReferenceResult] = [] - for raw_ref in raw_references: - api_ref = _transform_to_reference_result(raw_ref) - api_references.append(api_ref) - - # Step 4: Group references by definition - if group_by_definition and definitions: - return _group_references_by_definition( - definitions=definitions, - references=api_references, - include_definition=include_definition, - ) - else: - # Return single group with placeholder definition or first definition - if definitions: - definition = definitions[0] - else: - # Create placeholder definition when no definition found - definition = DefinitionResult( - name=symbol_name, - kind=symbol_kind or "unknown", - file_path="", - line=0, - end_line=0, - signature=None, - container=None, - score=0.0, - ) - - return [GroupedReferences( - definition=definition, - references=api_references, - )] - - finally: - engine.close() - - -def _group_references_by_definition( - definitions: List[DefinitionResult], - references: List[ReferenceResult], - include_definition: bool = True, -) -> List[GroupedReferences]: - """Group references by their likely definition. - - Uses file proximity heuristic to assign references to definitions. - References in the same file or directory as a definition are - assigned to that definition. 
- - Args: - definitions: List of definition locations - references: List of reference locations - include_definition: Whether to include definition in results - - Returns: - List of GroupedReferences with references assigned to definitions - """ - import os - - if not definitions: - return [] - - if len(definitions) == 1: - # Single definition - all references belong to it - return [GroupedReferences( - definition=definitions[0], - references=references, - )] - - # Multiple definitions - group by proximity - groups: Dict[int, List[ReferenceResult]] = { - i: [] for i in range(len(definitions)) - } - - for ref in references: - # Find the closest definition by file proximity - best_def_idx = 0 - best_score = -1 - - for i, defn in enumerate(definitions): - score = _proximity_score(ref.file_path, defn.file_path) - if score > best_score: - best_score = score - best_def_idx = i - - groups[best_def_idx].append(ref) - - # Build result groups - result: List[GroupedReferences] = [] - for i, defn in enumerate(definitions): - # Skip definitions with no references if not including definition itself - if not include_definition and not groups[i]: - continue - - result.append(GroupedReferences( - definition=defn, - references=groups[i], - )) - - return result - - -def _proximity_score(ref_path: str, def_path: str) -> int: - """Calculate proximity score between two file paths. 
- - Args: - ref_path: Reference file path - def_path: Definition file path - - Returns: - Proximity score (higher = closer): - - Same file: 1000 - - Same directory: 100 - - Otherwise: common path prefix length - """ - import os - - if not ref_path or not def_path: - return 0 - - # Normalize paths - ref_path = os.path.normpath(ref_path) - def_path = os.path.normpath(def_path) - - # Same file - if ref_path == def_path: - return 1000 - - ref_dir = os.path.dirname(ref_path) - def_dir = os.path.dirname(def_path) - - # Same directory - if ref_dir == def_dir: - return 100 - - # Common path prefix - try: - common = os.path.commonpath([ref_path, def_path]) - return len(common) - except ValueError: - # No common path (different drives on Windows) - return 0 - - -# Type alias for the raw reference from ChainSearchEngine -class RawReferenceResult: - """Type stub for ChainSearchEngine.ReferenceResult. - - This is only used for type hints and is replaced at runtime - by the actual import. - """ - file_path: str - line: int - column: int - context: str - relationship_type: str diff --git a/codex-lens/build/lib/codexlens/api/semantic.py b/codex-lens/build/lib/codexlens/api/semantic.py deleted file mode 100644 index f17e1c8b..00000000 --- a/codex-lens/build/lib/codexlens/api/semantic.py +++ /dev/null @@ -1,471 +0,0 @@ -"""Semantic search API with RRF fusion. - -This module provides the semantic_search() function for combining -vector, structural, and keyword search with configurable fusion strategies. 
-""" - -from __future__ import annotations - -import logging -from pathlib import Path -from typing import List, Optional - -from .models import SemanticResult -from .utils import resolve_project - -logger = logging.getLogger(__name__) - - -def semantic_search( - project_root: str, - query: str, - mode: str = "fusion", - vector_weight: float = 0.5, - structural_weight: float = 0.3, - keyword_weight: float = 0.2, - fusion_strategy: str = "rrf", - kind_filter: Optional[List[str]] = None, - limit: int = 20, - include_match_reason: bool = False, -) -> List[SemanticResult]: - """Semantic search - combining vector and structural search. - - This function provides a high-level API for semantic code search, - combining vector similarity, structural (symbol + relationships), - and keyword-based search methods with configurable fusion. - - Args: - project_root: Project root directory - query: Natural language query - mode: Search mode - - vector: Vector search only - - structural: Structural search only (symbol + relationships) - - fusion: Fusion search (default) - vector_weight: Vector search weight [0, 1] (default 0.5) - structural_weight: Structural search weight [0, 1] (default 0.3) - keyword_weight: Keyword search weight [0, 1] (default 0.2) - fusion_strategy: Fusion strategy (maps to chain_search.py) - - rrf: Reciprocal Rank Fusion (recommended, default) - - staged: Staged cascade -> staged_cascade_search - - binary: Binary rerank cascade -> binary_cascade_search - - hybrid: Hybrid cascade -> hybrid_cascade_search - kind_filter: Symbol type filter (e.g., ["function", "class"]) - limit: Max return count (default 20) - include_match_reason: Generate match reason (heuristic, not LLM) - - Returns: - Results sorted by fusion_score - - Degradation: - - No vector index: vector_score=None, uses FTS + structural search - - No relationship data: structural_score=None, vector search only - - Examples: - >>> results = semantic_search( - ... "/path/to/project", - ... 
"authentication handler", - ... mode="fusion", - ... fusion_strategy="rrf" - ... ) - >>> for r in results: - ... print(f"{r.symbol_name}: {r.fusion_score:.3f}") - """ - # Validate and resolve project path - project_path = resolve_project(project_root) - - # Normalize weights to sum to 1.0 - total_weight = vector_weight + structural_weight + keyword_weight - if total_weight > 0: - vector_weight = vector_weight / total_weight - structural_weight = structural_weight / total_weight - keyword_weight = keyword_weight / total_weight - else: - # Default to equal weights if all zero - vector_weight = structural_weight = keyword_weight = 1.0 / 3.0 - - # Initialize search infrastructure - try: - from codexlens.config import Config - from codexlens.storage.registry import RegistryStore - from codexlens.storage.path_mapper import PathMapper - from codexlens.search.chain_search import ChainSearchEngine, SearchOptions - except ImportError as exc: - logger.error("Failed to import search dependencies: %s", exc) - return [] - - # Load config - config = Config.load() - - # Get or create registry and mapper - try: - registry = RegistryStore.default() - mapper = PathMapper(registry) - except Exception as exc: - logger.error("Failed to initialize search infrastructure: %s", exc) - return [] - - # Build search options based on mode - search_options = _build_search_options( - mode=mode, - vector_weight=vector_weight, - structural_weight=structural_weight, - keyword_weight=keyword_weight, - limit=limit, - ) - - # Execute search based on fusion_strategy - try: - with ChainSearchEngine(registry, mapper, config=config) as engine: - chain_result = _execute_search( - engine=engine, - query=query, - source_path=project_path, - fusion_strategy=fusion_strategy, - options=search_options, - limit=limit, - ) - except Exception as exc: - logger.error("Search execution failed: %s", exc) - return [] - - # Transform results to SemanticResult - semantic_results = _transform_results( - 
results=chain_result.results, - mode=mode, - vector_weight=vector_weight, - structural_weight=structural_weight, - keyword_weight=keyword_weight, - kind_filter=kind_filter, - include_match_reason=include_match_reason, - query=query, - ) - - return semantic_results[:limit] - - -def _build_search_options( - mode: str, - vector_weight: float, - structural_weight: float, - keyword_weight: float, - limit: int, -) -> "SearchOptions": - """Build SearchOptions based on mode and weights. - - Args: - mode: Search mode (vector, structural, fusion) - vector_weight: Vector search weight - structural_weight: Structural search weight - keyword_weight: Keyword search weight - limit: Result limit - - Returns: - Configured SearchOptions - """ - from codexlens.search.chain_search import SearchOptions - - # Default options - options = SearchOptions( - total_limit=limit * 2, # Fetch extra for filtering - limit_per_dir=limit, - include_symbols=True, # Always include symbols for structural - ) - - if mode == "vector": - # Pure vector mode - options.hybrid_mode = True - options.enable_vector = True - options.pure_vector = True - options.enable_fuzzy = False - elif mode == "structural": - # Structural only - use FTS + symbols - options.hybrid_mode = True - options.enable_vector = False - options.enable_fuzzy = True - options.include_symbols = True - else: - # Fusion mode (default) - options.hybrid_mode = True - options.enable_vector = vector_weight > 0 - options.enable_fuzzy = keyword_weight > 0 - options.include_symbols = structural_weight > 0 - - # Set custom weights for RRF - if options.enable_vector and keyword_weight > 0: - options.hybrid_weights = { - "vector": vector_weight, - "exact": keyword_weight * 0.7, - "fuzzy": keyword_weight * 0.3, - } - - return options - - -def _execute_search( - engine: "ChainSearchEngine", - query: str, - source_path: Path, - fusion_strategy: str, - options: "SearchOptions", - limit: int, -) -> "ChainSearchResult": - """Execute search using appropriate 
strategy. - - Maps fusion_strategy to ChainSearchEngine methods: - - rrf: Standard hybrid search with RRF fusion - - staged: staged_cascade_search - - binary: binary_cascade_search - - hybrid: hybrid_cascade_search - - Args: - engine: ChainSearchEngine instance - query: Search query - source_path: Project root path - fusion_strategy: Strategy name - options: Search options - limit: Result limit - - Returns: - ChainSearchResult from the search - """ - from codexlens.search.chain_search import ChainSearchResult - - if fusion_strategy == "staged": - # Use staged cascade search (4-stage pipeline) - return engine.staged_cascade_search( - query=query, - source_path=source_path, - k=limit, - coarse_k=limit * 5, - options=options, - ) - elif fusion_strategy == "binary": - # Use binary cascade search (binary coarse + dense fine) - return engine.binary_cascade_search( - query=query, - source_path=source_path, - k=limit, - coarse_k=limit * 5, - options=options, - ) - elif fusion_strategy == "hybrid": - # Use hybrid cascade search (FTS+SPLADE+Vector + cross-encoder) - return engine.hybrid_cascade_search( - query=query, - source_path=source_path, - k=limit, - coarse_k=limit * 5, - options=options, - ) - else: - # Default: rrf - Standard search with RRF fusion - return engine.search( - query=query, - source_path=source_path, - options=options, - ) - - -def _transform_results( - results: List, - mode: str, - vector_weight: float, - structural_weight: float, - keyword_weight: float, - kind_filter: Optional[List[str]], - include_match_reason: bool, - query: str, -) -> List[SemanticResult]: - """Transform ChainSearchEngine results to SemanticResult. 
- - Args: - results: List of SearchResult objects - mode: Search mode - vector_weight: Vector weight used - structural_weight: Structural weight used - keyword_weight: Keyword weight used - kind_filter: Optional symbol kind filter - include_match_reason: Whether to generate match reasons - query: Original query (for match reason generation) - - Returns: - List of SemanticResult objects - """ - semantic_results = [] - - for result in results: - # Extract symbol info - symbol_name = getattr(result, "symbol_name", None) - symbol_kind = getattr(result, "symbol_kind", None) - start_line = getattr(result, "start_line", None) - - # Use symbol object if available - if hasattr(result, "symbol") and result.symbol: - symbol_name = symbol_name or result.symbol.name - symbol_kind = symbol_kind or result.symbol.kind - if hasattr(result.symbol, "range") and result.symbol.range: - start_line = start_line or result.symbol.range[0] - - # Filter by kind if specified - if kind_filter and symbol_kind: - if symbol_kind.lower() not in [k.lower() for k in kind_filter]: - continue - - # Determine scores based on mode and metadata - metadata = getattr(result, "metadata", {}) or {} - fusion_score = result.score - - # Try to extract source scores from metadata - source_scores = metadata.get("source_scores", {}) - vector_score: Optional[float] = None - structural_score: Optional[float] = None - - if mode == "vector": - # In pure vector mode, the main score is the vector score - vector_score = result.score - structural_score = None - elif mode == "structural": - # In structural mode, no vector score - vector_score = None - structural_score = result.score - else: - # Fusion mode - try to extract individual scores - if "vector" in source_scores: - vector_score = source_scores["vector"] - elif metadata.get("fusion_method") == "simple_weighted": - # From weighted fusion - vector_score = source_scores.get("vector") - - # Structural score approximation (from exact/fuzzy FTS) - fts_scores = [] - if 
"exact" in source_scores: - fts_scores.append(source_scores["exact"]) - if "fuzzy" in source_scores: - fts_scores.append(source_scores["fuzzy"]) - if "splade" in source_scores: - fts_scores.append(source_scores["splade"]) - - if fts_scores: - structural_score = max(fts_scores) - - # Build snippet - snippet = getattr(result, "excerpt", "") or getattr(result, "content", "") - if len(snippet) > 500: - snippet = snippet[:500] + "..." - - # Generate match reason if requested - match_reason = None - if include_match_reason: - match_reason = _generate_match_reason( - query=query, - symbol_name=symbol_name, - symbol_kind=symbol_kind, - snippet=snippet, - vector_score=vector_score, - structural_score=structural_score, - ) - - semantic_result = SemanticResult( - symbol_name=symbol_name or Path(result.path).stem, - kind=symbol_kind or "unknown", - file_path=result.path, - line=start_line or 1, - vector_score=vector_score, - structural_score=structural_score, - fusion_score=fusion_score, - snippet=snippet, - match_reason=match_reason, - ) - - semantic_results.append(semantic_result) - - # Sort by fusion_score descending - semantic_results.sort(key=lambda r: r.fusion_score, reverse=True) - - return semantic_results - - -def _generate_match_reason( - query: str, - symbol_name: Optional[str], - symbol_kind: Optional[str], - snippet: str, - vector_score: Optional[float], - structural_score: Optional[float], -) -> str: - """Generate human-readable match reason heuristically. - - This is a simple heuristic-based approach, not LLM-powered. 
- - Args: - query: Original search query - symbol_name: Symbol name if available - symbol_kind: Symbol kind if available - snippet: Code snippet - vector_score: Vector similarity score - structural_score: Structural match score - - Returns: - Human-readable explanation string - """ - reasons = [] - - # Check for direct name match - query_lower = query.lower() - query_words = set(query_lower.split()) - - if symbol_name: - name_lower = symbol_name.lower() - # Direct substring match - if query_lower in name_lower or name_lower in query_lower: - reasons.append(f"Symbol name '{symbol_name}' matches query") - # Word overlap - name_words = set(_split_camel_case(symbol_name).lower().split()) - overlap = query_words & name_words - if overlap and not reasons: - reasons.append(f"Symbol name contains: {', '.join(overlap)}") - - # Check snippet for keyword matches - snippet_lower = snippet.lower() - matching_words = [w for w in query_words if w in snippet_lower and len(w) > 2] - if matching_words and len(reasons) < 2: - reasons.append(f"Code contains keywords: {', '.join(matching_words[:3])}") - - # Add score-based reasoning - if vector_score is not None and vector_score > 0.7: - reasons.append("High semantic similarity") - elif vector_score is not None and vector_score > 0.5: - reasons.append("Moderate semantic similarity") - - if structural_score is not None and structural_score > 0.8: - reasons.append("Strong structural match") - - # Symbol kind context - if symbol_kind and len(reasons) < 3: - reasons.append(f"Matched {symbol_kind}") - - if not reasons: - reasons.append("Partial relevance based on content analysis") - - return "; ".join(reasons[:3]) - - -def _split_camel_case(name: str) -> str: - """Split camelCase and PascalCase to words. 
- - Args: - name: Symbol name in camelCase or PascalCase - - Returns: - Space-separated words - """ - import re - - # Insert space before uppercase letters - result = re.sub(r"([a-z])([A-Z])", r"\1 \2", name) - # Insert space before uppercase followed by lowercase - result = re.sub(r"([A-Z]+)([A-Z][a-z])", r"\1 \2", result) - # Replace underscores with spaces - result = result.replace("_", " ") - - return result diff --git a/codex-lens/build/lib/codexlens/api/symbols.py b/codex-lens/build/lib/codexlens/api/symbols.py deleted file mode 100644 index 8faf248f..00000000 --- a/codex-lens/build/lib/codexlens/api/symbols.py +++ /dev/null @@ -1,146 +0,0 @@ -"""workspace_symbols API implementation. - -This module provides the workspace_symbols() function for searching -symbols across the entire workspace with prefix matching. -""" - -from __future__ import annotations - -import fnmatch -import logging -from pathlib import Path -from typing import List, Optional - -from ..entities import Symbol -from ..storage.global_index import GlobalSymbolIndex -from ..storage.registry import RegistryStore -from ..errors import IndexNotFoundError -from .models import SymbolInfo -from .utils import resolve_project - -logger = logging.getLogger(__name__) - - -def workspace_symbols( - project_root: str, - query: str, - kind_filter: Optional[List[str]] = None, - file_pattern: Optional[str] = None, - limit: int = 50 -) -> List[SymbolInfo]: - """Search for symbols across the entire workspace. - - Uses prefix matching for efficient searching. 
- - Args: - project_root: Project root directory (for index location) - query: Search query (prefix match) - kind_filter: Optional list of symbol kinds to include - (e.g., ["class", "function"]) - file_pattern: Optional glob pattern to filter by file path - (e.g., "*.py", "src/**/*.ts") - limit: Maximum number of results to return - - Returns: - List of SymbolInfo sorted by score - - Raises: - IndexNotFoundError: If project is not indexed - """ - project_path = resolve_project(project_root) - - # Get project info from registry - registry = RegistryStore() - project_info = registry.get_project(project_path) - if project_info is None: - raise IndexNotFoundError(f"Project not indexed: {project_path}") - - # Open global symbol index - index_db = project_info.index_root / "_global_symbols.db" - if not index_db.exists(): - raise IndexNotFoundError(f"Global symbol index not found: {index_db}") - - global_index = GlobalSymbolIndex(str(index_db), project_info.id) - - # Search with prefix matching - # If kind_filter has multiple kinds, we need to search for each - all_results: List[Symbol] = [] - - if kind_filter and len(kind_filter) > 0: - # Search for each kind separately - for kind in kind_filter: - results = global_index.search( - name=query, - kind=kind, - limit=limit, - prefix_mode=True - ) - all_results.extend(results) - else: - # Search without kind filter - all_results = global_index.search( - name=query, - kind=None, - limit=limit, - prefix_mode=True - ) - - logger.debug(f"Found {len(all_results)} symbols matching '{query}'") - - # Apply file pattern filter if specified - if file_pattern: - all_results = [ - sym for sym in all_results - if sym.file and fnmatch.fnmatch(sym.file, file_pattern) - ] - logger.debug(f"After file filter '{file_pattern}': {len(all_results)} symbols") - - # Convert to SymbolInfo and sort by relevance - symbols = [ - SymbolInfo( - name=sym.name, - kind=sym.kind, - file_path=sym.file or "", - line=sym.range[0] if sym.range else 1, - 
container=None, # Could extract from parent - score=_calculate_score(sym.name, query) - ) - for sym in all_results - ] - - # Sort by score (exact matches first) - symbols.sort(key=lambda s: s.score, reverse=True) - - return symbols[:limit] - - -def _calculate_score(symbol_name: str, query: str) -> float: - """Calculate relevance score for a symbol match. - - Scoring: - - Exact match: 1.0 - - Prefix match: 0.8 + 0.2 * (query_len / symbol_len) - - Case-insensitive match: 0.6 - - Args: - symbol_name: The matched symbol name - query: The search query - - Returns: - Score between 0.0 and 1.0 - """ - if symbol_name == query: - return 1.0 - - if symbol_name.lower() == query.lower(): - return 0.9 - - if symbol_name.startswith(query): - ratio = len(query) / len(symbol_name) - return 0.8 + 0.2 * ratio - - if symbol_name.lower().startswith(query.lower()): - ratio = len(query) / len(symbol_name) - return 0.6 + 0.2 * ratio - - return 0.5 diff --git a/codex-lens/build/lib/codexlens/api/utils.py b/codex-lens/build/lib/codexlens/api/utils.py deleted file mode 100644 index 3621533a..00000000 --- a/codex-lens/build/lib/codexlens/api/utils.py +++ /dev/null @@ -1,153 +0,0 @@ -"""Utility functions for the codexlens API. - -This module provides helper functions for: -- Project resolution -- Relationship type normalization -- Result ranking by proximity -""" - -from __future__ import annotations - -import os -from pathlib import Path -from typing import List, Optional, TypeVar, Callable - -from .models import DefinitionResult - - -# Type variable for generic ranking -T = TypeVar('T') - - -def resolve_project(project_root: str) -> Path: - """Resolve and validate project root path. 
- - Args: - project_root: Path to project root (relative or absolute) - - Returns: - Resolved absolute Path - - Raises: - ValueError: If path does not exist or is not a directory - """ - path = Path(project_root).resolve() - if not path.exists(): - raise ValueError(f"Project root does not exist: {path}") - if not path.is_dir(): - raise ValueError(f"Project root is not a directory: {path}") - return path - - -# Relationship type normalization mapping -_RELATIONSHIP_NORMALIZATION = { - # Plural to singular - "calls": "call", - "imports": "import", - "inherits": "inheritance", - "uses": "use", - # Already normalized (passthrough) - "call": "call", - "import": "import", - "inheritance": "inheritance", - "use": "use", - "type_annotation": "type_annotation", -} - - -def normalize_relationship_type(relationship: str) -> str: - """Normalize relationship type to canonical form. - - Converts plural forms and variations to standard singular forms: - - 'calls' -> 'call' - - 'imports' -> 'import' - - 'inherits' -> 'inheritance' - - 'uses' -> 'use' - - Args: - relationship: Raw relationship type string - - Returns: - Normalized relationship type - - Examples: - >>> normalize_relationship_type('calls') - 'call' - >>> normalize_relationship_type('inherits') - 'inheritance' - >>> normalize_relationship_type('call') - 'call' - """ - return _RELATIONSHIP_NORMALIZATION.get(relationship.lower(), relationship) - - -def rank_by_proximity( - results: List[DefinitionResult], - file_context: Optional[str] = None -) -> List[DefinitionResult]: - """Rank results by file path proximity to context. - - V1 Implementation: Uses path-based proximity scoring. - - Scoring algorithm: - 1. Same directory: highest score (100) - 2. Otherwise: length of common path prefix - - Args: - results: List of definition results to rank - file_context: Reference file path for proximity calculation. - If None, returns results unchanged. 
- - Returns: - Results sorted by proximity score (highest first) - - Examples: - >>> results = [ - ... DefinitionResult(name="foo", kind="function", - ... file_path="/a/b/c.py", line=1, end_line=10), - ... DefinitionResult(name="foo", kind="function", - ... file_path="/a/x/y.py", line=1, end_line=10), - ... ] - >>> ranked = rank_by_proximity(results, "/a/b/test.py") - >>> ranked[0].file_path - '/a/b/c.py' - """ - if not file_context or not results: - return results - - def proximity_score(result: DefinitionResult) -> int: - """Calculate proximity score for a result.""" - result_dir = os.path.dirname(result.file_path) - context_dir = os.path.dirname(file_context) - - # Same directory gets highest score - if result_dir == context_dir: - return 100 - - # Otherwise, score by common path prefix length - try: - common = os.path.commonpath([result.file_path, file_context]) - return len(common) - except ValueError: - # No common path (different drives on Windows) - return 0 - - return sorted(results, key=proximity_score, reverse=True) - - -def rank_by_score( - results: List[T], - score_fn: Callable[[T], float], - reverse: bool = True -) -> List[T]: - """Generic ranking function by custom score. 
- - Args: - results: List of items to rank - score_fn: Function to extract score from item - reverse: If True, highest scores first (default) - - Returns: - Sorted list - """ - return sorted(results, key=score_fn, reverse=reverse) diff --git a/codex-lens/build/lib/codexlens/cli/__init__.py b/codex-lens/build/lib/codexlens/cli/__init__.py deleted file mode 100644 index 18523b4c..00000000 --- a/codex-lens/build/lib/codexlens/cli/__init__.py +++ /dev/null @@ -1,27 +0,0 @@ -"""CLI package for CodexLens.""" - -from __future__ import annotations - -import sys -import os - -# Force UTF-8 encoding for Windows console -# This ensures Chinese characters display correctly instead of GBK garbled text -if sys.platform == "win32": - # Set environment variable for Python I/O encoding - os.environ.setdefault("PYTHONIOENCODING", "utf-8") - - # Reconfigure stdout/stderr to use UTF-8 if possible - try: - if hasattr(sys.stdout, "reconfigure"): - sys.stdout.reconfigure(encoding="utf-8", errors="replace") - if hasattr(sys.stderr, "reconfigure"): - sys.stderr.reconfigure(encoding="utf-8", errors="replace") - except Exception: - # Fallback: some environments don't support reconfigure - pass - -from .commands import app - -__all__ = ["app"] - diff --git a/codex-lens/build/lib/codexlens/cli/commands.py b/codex-lens/build/lib/codexlens/cli/commands.py deleted file mode 100644 index ebf81101..00000000 --- a/codex-lens/build/lib/codexlens/cli/commands.py +++ /dev/null @@ -1,4494 +0,0 @@ -"""Typer commands for CodexLens.""" - -from __future__ import annotations - -import json -import logging -import os -import shutil -import sqlite3 -from pathlib import Path -from typing import Annotated, Any, Dict, Iterable, List, Optional - -import typer -from rich.progress import BarColumn, Progress, SpinnerColumn, TextColumn, TimeElapsedColumn -from rich.table import Table - -from codexlens.config import Config -from codexlens.entities import IndexedFile, SearchResult, Symbol -from codexlens.errors import 
CodexLensError, ConfigError, ParseError, StorageError, SearchError -from codexlens.parsers.factory import ParserFactory -from codexlens.storage.path_mapper import PathMapper -from codexlens.storage.registry import RegistryStore, ProjectInfo -from codexlens.storage.index_tree import IndexTreeBuilder -from codexlens.storage.dir_index import DirIndexStore -from codexlens.search.chain_search import ChainSearchEngine, SearchOptions -from codexlens.watcher import WatcherManager, WatcherConfig - -from .output import ( - console, - print_json, - render_file_inspect, - render_search_results, - render_status, - render_symbols, -) - -app = typer.Typer(help="CodexLens CLI — local code indexing and search.") - -# Index subcommand group for reorganized commands -index_app = typer.Typer(help="Index management commands (init, embeddings, splade, binary, status, migrate, all)") -app.add_typer(index_app, name="index") - - -def _deprecated_command_warning(old_name: str, new_name: str) -> None: - """Display deprecation warning for renamed commands. - - Args: - old_name: The old command name being deprecated - new_name: The new command name to use instead - """ - console.print( - f"[yellow]Warning:[/yellow] '{old_name}' is deprecated. " - f"Use '{new_name}' instead." - ) - - -def _configure_logging(verbose: bool, json_mode: bool = False) -> None: - """Configure logging level. - - In JSON mode, suppress INFO logs to keep stderr clean for error parsing. - Only WARNING and above are shown to avoid mixing logs with JSON output. 
- """ - if json_mode and not verbose: - # In JSON mode, suppress INFO logs to keep stderr clean - level = logging.WARNING - else: - level = logging.DEBUG if verbose else logging.INFO - logging.basicConfig(level=level, format="%(levelname)s %(message)s") - - -def _parse_languages(raw: Optional[List[str]]) -> Optional[List[str]]: - if not raw: - return None - langs: List[str] = [] - for item in raw: - for part in item.split(","): - part = part.strip() - if part: - langs.append(part) - return langs or None - - -def _get_index_root() -> Path: - """Get the index root directory from config or default. - - Priority order: - 1. CODEXLENS_INDEX_DIR environment variable - 2. index_dir from ~/.codexlens/config.json - 3. Default: ~/.codexlens/indexes - """ - env_override = os.getenv("CODEXLENS_INDEX_DIR") - if env_override: - return Path(env_override).expanduser().resolve() - - # Read from config.json - config_file = Path.home() / ".codexlens" / "config.json" - if config_file.exists(): - try: - cfg = json.loads(config_file.read_text(encoding="utf-8")) - if "index_dir" in cfg: - return Path(cfg["index_dir"]).expanduser().resolve() - except (json.JSONDecodeError, OSError): - pass # Fall through to default - - return Path.home() / ".codexlens" / "indexes" - - -def _get_registry_path() -> Path: - """Get the registry database path.""" - env_override = os.getenv("CODEXLENS_DATA_DIR") - if env_override: - return Path(env_override).expanduser().resolve() / "registry.db" - return Path.home() / ".codexlens" / "registry.db" - - -@index_app.command("init") -def index_init( - path: Path = typer.Argument(Path("."), exists=True, file_okay=False, dir_okay=True, help="Project root to index."), - language: Optional[List[str]] = typer.Option( - None, - "--language", - "-l", - help="Limit indexing to specific languages (repeat or comma-separated).", - ), - workers: Optional[int] = typer.Option(None, "--workers", "-w", min=1, help="Parallel worker processes (default: auto-detect based on CPU 
count)."), - force: bool = typer.Option(False, "--force", "-f", help="Force full reindex (skip incremental mode)."), - no_embeddings: bool = typer.Option(False, "--no-embeddings", help="Skip automatic embedding generation (if semantic deps installed)."), - backend: Optional[str] = typer.Option(None, "--backend", "-b", help="Embedding backend: fastembed (local) or litellm (remote API). Defaults to settings.json config."), - model: Optional[str] = typer.Option(None, "--model", "-m", help="Embedding model: profile name for fastembed or model name for litellm. Defaults to settings.json config."), - max_workers: int = typer.Option(1, "--max-workers", min=1, help="Max concurrent API calls for embedding generation. Recommended: 4-8 for litellm backend."), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), - verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."), -) -> None: - """Initialize or rebuild the index for a directory. - - Indexes are stored in ~/.codexlens/indexes/ with mirrored directory structure. - Set CODEXLENS_INDEX_DIR to customize the index location. - - By default, uses incremental indexing (skip unchanged files). - Use --force to rebuild all files regardless of modification time. - - If semantic search dependencies are installed, automatically generates embeddings - after indexing completes. Use --no-embeddings to skip this step. 
- - Backend Options (--backend): - - fastembed: Local ONNX-based embeddings (default, no API calls) - - litellm: Remote API embeddings via ccw-litellm (requires API keys) - - Model Options (--model): - - For fastembed backend: Use profile names (fast, code, multilingual, balanced) - - For litellm backend: Use model names (e.g., text-embedding-3-small, text-embedding-ada-002) - """ - _configure_logging(verbose, json_mode) - config = Config() - - # Fallback to settings.json config if CLI params not provided - config.load_settings() # Ensure settings are loaded - actual_backend = backend or config.embedding_backend - actual_model = model or config.embedding_model - - languages = _parse_languages(language) - base_path = path.expanduser().resolve() - - registry: RegistryStore | None = None - try: - registry = RegistryStore() - registry.initialize() - mapper = PathMapper() - - builder = IndexTreeBuilder(registry, mapper, config, incremental=not force) - - if force: - console.print(f"[bold]Building index for:[/bold] {base_path} [yellow](FULL reindex)[/yellow]") - else: - console.print(f"[bold]Building index for:[/bold] {base_path} [dim](incremental)[/dim]") - - build_result = builder.build( - source_root=base_path, - languages=languages, - workers=workers, - force_full=force, - ) - - result = { - "path": str(base_path), - "files_indexed": build_result.total_files, - "dirs_indexed": build_result.total_dirs, - "index_root": str(build_result.index_root), - "project_id": build_result.project_id, - "languages": languages or sorted(config.supported_languages.keys()), - "errors": len(build_result.errors), - } - - if not json_mode: - console.print(f"[green]OK[/green] Indexed [bold]{build_result.total_files}[/bold] files in [bold]{build_result.total_dirs}[/bold] directories") - console.print(f" Index root: {build_result.index_root}") - if build_result.errors: - console.print(f" [yellow]Warnings:[/yellow] {len(build_result.errors)} errors") - - # Auto-generate embeddings if the 
requested backend is available - if not no_embeddings: - try: - from codexlens.semantic import is_embedding_backend_available - from codexlens.cli.embedding_manager import generate_embeddings_recursive, get_embeddings_status - - # Validate embedding backend - valid_backends = ["fastembed", "litellm"] - if actual_backend not in valid_backends: - error_msg = f"Invalid embedding backend: {actual_backend}. Must be one of: {', '.join(valid_backends)}" - if json_mode: - print_json(success=False, error=error_msg) - else: - console.print(f"[red]Error:[/red] {error_msg}") - raise typer.Exit(code=1) - - backend_available, backend_error = is_embedding_backend_available(actual_backend) - - if backend_available: - # Use the index root directory (not the _index.db file) - index_root = Path(build_result.index_root) - - if not json_mode: - console.print("\n[bold]Generating embeddings...[/bold]") - console.print(f"Backend: [cyan]{actual_backend}[/cyan]") - console.print(f"Model: [cyan]{actual_model}[/cyan]") - else: - # Output progress message for JSON mode (parsed by Node.js) - print("Generating embeddings...", flush=True) - - # Progress callback - outputs progress for both json and non-json modes - # Node.js parseProgressLine() expects formats like: - # - "Batch X: N files, M chunks" - # - "Processing N files" - # - "Finalizing index" - def progress_update(msg: str): - if json_mode: - # Output without prefix so Node.js can parse it - # Strip leading spaces that embedding_manager adds - print(msg.strip(), flush=True) - elif verbose: - console.print(f" {msg}") - - embed_result = generate_embeddings_recursive( - index_root, - embedding_backend=actual_backend, - model_profile=actual_model, - force=False, # Don't force regenerate during init - chunk_size=2000, - progress_callback=progress_update, # Always use callback - max_workers=max_workers, - ) - - if embed_result["success"]: - embed_data = embed_result["result"] - - # Output completion message for Node.js to parse - if json_mode: 
- print(f"Embeddings complete: {embed_data['total_chunks_created']} chunks", flush=True) - - # Get comprehensive coverage statistics - status_result = get_embeddings_status(index_root) - if status_result["success"]: - coverage = status_result["result"] - result["embeddings"] = { - "generated": True, - "total_indexes": coverage["total_indexes"], - "total_files": coverage["total_files"], - "files_with_embeddings": coverage["files_with_embeddings"], - "coverage_percent": coverage["coverage_percent"], - "total_chunks": coverage["total_chunks"], - } - else: - result["embeddings"] = { - "generated": True, - "total_chunks": embed_data["total_chunks_created"], - "files_processed": embed_data["total_files_processed"], - } - - if not json_mode: - console.print(f"[green]✓[/green] Generated embeddings for [bold]{embed_data['total_files_processed']}[/bold] files") - console.print(f" Total chunks: [bold]{embed_data['total_chunks_created']}[/bold]") - console.print(f" Indexes processed: [bold]{embed_data['indexes_successful']}/{embed_data['indexes_processed']}[/bold]") - else: - if not json_mode: - console.print(f"[yellow]Warning:[/yellow] Embedding generation failed: {embed_result.get('error', 'Unknown error')}") - result["embeddings"] = { - "generated": False, - "error": embed_result.get("error"), - } - else: - if not json_mode and verbose: - console.print(f"[dim]Embedding backend '{actual_backend}' not available. 
Skipping embeddings.[/dim]") - result["embeddings"] = { - "generated": False, - "error": backend_error or "Embedding backend not available", - } - except Exception as e: - if not json_mode and verbose: - console.print(f"[yellow]Warning:[/yellow] Could not generate embeddings: {e}") - result["embeddings"] = { - "generated": False, - "error": str(e), - } - else: - result["embeddings"] = { - "generated": False, - "error": "Skipped (--no-embeddings)", - } - - # Output final JSON result with embeddings status - if json_mode: - print_json(success=True, result=result) - - except StorageError as exc: - if json_mode: - print_json(success=False, error=f"Storage error: {exc}") - else: - console.print(f"[red]Init failed (storage):[/red] {exc}") - raise typer.Exit(code=1) - except ConfigError as exc: - if json_mode: - print_json(success=False, error=f"Configuration error: {exc}") - else: - console.print(f"[red]Init failed (config):[/red] {exc}") - raise typer.Exit(code=1) - except ParseError as exc: - if json_mode: - print_json(success=False, error=f"Parse error: {exc}") - else: - console.print(f"[red]Init failed (parse):[/red] {exc}") - raise typer.Exit(code=1) - except PermissionError as exc: - if json_mode: - print_json(success=False, error=f"Permission denied: {exc}") - else: - console.print(f"[red]Init failed (permission denied):[/red] {exc}") - raise typer.Exit(code=1) - except CodexLensError as exc: - if json_mode: - print_json(success=False, error=str(exc)) - else: - console.print(f"[red]Init failed:[/red] {exc}") - raise typer.Exit(code=1) - finally: - if registry is not None: - registry.close() - - -@app.command() -def watch( - path: Path = typer.Argument(Path("."), exists=True, file_okay=False, dir_okay=True, help="Project root to watch."), - language: Optional[List[str]] = typer.Option( - None, - "--language", - "-l", - help="Limit watching to specific languages (repeat or comma-separated).", - ), - debounce: int = typer.Option(1000, "--debounce", "-d", min=100, 
max=10000, help="Debounce interval in milliseconds."), - verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose logging."), -) -> None: - """Watch directory for changes and update index incrementally. - - Monitors filesystem events and automatically updates the index - when files are created, modified, or deleted. - - The directory must already be indexed (run 'codexlens init' first). - - Press Ctrl+C to stop watching. - - Examples: - codexlens watch . - codexlens watch /path/to/project --debounce 500 --verbose - codexlens watch . --language python,typescript - """ - _configure_logging(verbose) - - from codexlens.watcher.events import IndexResult - - base_path = path.expanduser().resolve() - - # Check if path is indexed - mapper = PathMapper() - index_db = mapper.source_to_index_db(base_path) - if not index_db.exists(): - console.print(f"[red]Error:[/red] Directory not indexed: {base_path}") - console.print("Run 'codexlens init' first to create the index.") - raise typer.Exit(code=1) - - # Parse languages - languages = _parse_languages(language) - - # Create watcher config - watcher_config = WatcherConfig( - debounce_ms=debounce, - languages=languages, - ) - - # Callback for indexed files - def on_indexed(result: IndexResult) -> None: - if result.files_indexed > 0: - console.print(f" [green]Indexed:[/green] {result.files_indexed} files ({result.symbols_added} symbols)") - if result.files_removed > 0: - console.print(f" [yellow]Removed:[/yellow] {result.files_removed} files") - if result.errors: - for error in result.errors[:3]: # Show first 3 errors - console.print(f" [red]Error:[/red] {error}") - - console.print(f"[bold]Watching:[/bold] {base_path}") - console.print(f" Debounce: {debounce}ms") - if languages: - console.print(f" Languages: {', '.join(languages)}") - console.print(" Press Ctrl+C to stop.\n") - - manager: WatcherManager | None = None - try: - manager = WatcherManager( - root_path=base_path, - watcher_config=watcher_config, - 
on_indexed=on_indexed, - ) - manager.start() - manager.wait() - except KeyboardInterrupt: - pass - except Exception as exc: - console.print(f"[red]Error:[/red] {exc}") - raise typer.Exit(code=1) - finally: - if manager is not None: - manager.stop() - console.print("\n[dim]Watcher stopped.[/dim]") - - -@app.command() -def search( - query: str = typer.Argument(..., help="Search query."), - path: Path = typer.Option(Path("."), "--path", "-p", help="Directory to search from."), - limit: int = typer.Option(20, "--limit", "-n", min=1, max=500, help="Max results."), - offset: int = typer.Option(0, "--offset", min=0, help="Pagination offset - skip first N results."), - depth: int = typer.Option(-1, "--depth", "-d", help="Search depth (-1 = unlimited, 0 = current only)."), - files_only: bool = typer.Option(False, "--files-only", "-f", help="Return only file paths without content snippets."), - method: str = typer.Option("dense_rerank", "--method", "-m", help="Search method: 'dense_rerank' (semantic, default), 'fts' (exact keyword)."), - use_fuzzy: bool = typer.Option(False, "--use-fuzzy", help="Enable fuzzy matching in FTS method."), - code_only: bool = typer.Option(False, "--code-only", help="Only return code files (excludes md, txt, json, yaml, xml, etc.)."), - exclude_extensions: Optional[str] = typer.Option(None, "--exclude-extensions", help="Comma-separated list of file extensions to exclude (e.g., 'md,txt,json')."), - # Hidden advanced options for backward compatibility - weights: Optional[str] = typer.Option( - None, - "--weights", "-w", - hidden=True, - help="[Advanced] RRF weights as key=value pairs." - ), - cascade_strategy: Optional[str] = typer.Option( - None, - "--cascade-strategy", - hidden=True, - help="[Advanced] Cascade strategy for --method cascade." 
- ), - # Hidden deprecated parameter for backward compatibility - mode: Optional[str] = typer.Option(None, "--mode", hidden=True, help="[DEPRECATED] Use --method instead."), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), - verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."), -) -> None: - """Search indexed file contents. - - Uses chain search across directory indexes. - Use --depth to limit search recursion (0 = current dir only). - - Search Methods: - - dense_rerank (default): Semantic search using Dense embedding coarse retrieval + - Cross-encoder reranking. Best for natural language queries and code understanding. - - fts: Full-text search using FTS5 (unicode61 tokenizer). Best for exact code - identifiers like function/class names. Use --use-fuzzy for typo tolerance. - - Method Selection Guide: - - Code identifiers (function/class names): fts - - Natural language queries: dense_rerank (default) - - Typo-tolerant search: fts --use-fuzzy - - Requirements: - The dense_rerank method requires pre-generated embeddings. - Use 'codexlens embeddings-generate' to create embeddings first. 
- - Examples: - # Default semantic search (dense_rerank) - codexlens search "authentication logic" - - # Exact code identifier search - codexlens search "authenticate_user" --method fts - - # Typo-tolerant fuzzy search - codexlens search "authentcate" --method fts --use-fuzzy - """ - _configure_logging(verbose, json_mode) - search_path = path.expanduser().resolve() - - # Handle deprecated --mode parameter - actual_method = method - if mode is not None: - # Show deprecation warning - if not json_mode: - console.print("[yellow]Warning: --mode is deprecated, use --method instead.[/yellow]") - - # Map old mode values to new method values - mode_to_method = { - "auto": "hybrid", - "exact": "fts", - "fuzzy": "fts", # with use_fuzzy=True - "hybrid": "hybrid", - "vector": "vector", - "pure-vector": "vector", - } - - if mode in mode_to_method: - actual_method = mode_to_method[mode] - # Enable fuzzy for old fuzzy mode - if mode == "fuzzy": - use_fuzzy = True - else: - if json_mode: - print_json(success=False, error=f"Invalid deprecated mode: {mode}. Use --method instead.") - else: - console.print(f"[red]Invalid deprecated mode:[/red] {mode}") - console.print("[dim]Use --method with: fts, vector, splade, hybrid, cascade[/dim]") - raise typer.Exit(code=1) - - # Configure search (load settings from file) - config = Config.load() - - # Validate method - simplified interface exposes only dense_rerank and fts - # Other methods (vector, splade, hybrid, cascade) are hidden but still work for backward compatibility - valid_methods = ["fts", "dense_rerank", "vector", "splade", "hybrid", "cascade"] - if actual_method not in valid_methods: - if json_mode: - print_json(success=False, error=f"Invalid method: {actual_method}. 
Use 'dense_rerank' (semantic) or 'fts' (exact keyword).") - else: - console.print(f"[red]Invalid method:[/red] {actual_method}") - console.print("[dim]Use 'dense_rerank' (semantic, default) or 'fts' (exact keyword)[/dim]") - raise typer.Exit(code=1) - - # Map dense_rerank to cascade method internally - internal_cascade_strategy = cascade_strategy - if actual_method == "dense_rerank": - actual_method = "cascade" - internal_cascade_strategy = "dense_rerank" - - # Validate cascade_strategy if provided (for advanced users) - if internal_cascade_strategy is not None: - valid_strategies = ["binary", "hybrid", "binary_rerank", "dense_rerank"] - if internal_cascade_strategy not in valid_strategies: - if json_mode: - print_json(success=False, error=f"Invalid cascade strategy: {internal_cascade_strategy}. Must be one of: {', '.join(valid_strategies)}") - else: - console.print(f"[red]Invalid cascade strategy:[/red] {internal_cascade_strategy}") - console.print(f"[dim]Valid strategies: {', '.join(valid_strategies)}[/dim]") - raise typer.Exit(code=1) - - # Parse custom weights if provided - hybrid_weights = None - if weights: - try: - # Check if using key=value format (new) or legacy comma-separated format - if "=" in weights: - # New format: splade=0.4,vector=0.6 or exact=0.3,fuzzy=0.1,vector=0.6 - weight_dict = {} - for pair in weights.split(","): - if "=" in pair: - key, val = pair.split("=", 1) - weight_dict[key.strip()] = float(val.strip()) - else: - raise ValueError("Mixed format not supported - use all key=value pairs") - - # Validate and normalize weights - weight_sum = sum(weight_dict.values()) - if abs(weight_sum - 1.0) > 0.01: - if not json_mode: - console.print(f"[yellow]Warning: Weights sum to {weight_sum:.2f}, should sum to 1.0. 
Normalizing...[/yellow]") - weight_dict = {k: v / weight_sum for k, v in weight_dict.items()} - - hybrid_weights = weight_dict - else: - # Legacy format: 0.3,0.1,0.6 (exact,fuzzy,vector) - weight_parts = [float(w.strip()) for w in weights.split(",")] - if len(weight_parts) == 3: - weight_sum = sum(weight_parts) - if abs(weight_sum - 1.0) > 0.01: - if not json_mode: - console.print(f"[yellow]Warning: Weights sum to {weight_sum:.2f}, should sum to 1.0. Normalizing...[/yellow]") - weight_parts = [w / weight_sum for w in weight_parts] - hybrid_weights = { - "exact": weight_parts[0], - "fuzzy": weight_parts[1], - "vector": weight_parts[2], - } - elif len(weight_parts) == 2: - # Two values: assume splade,vector - weight_sum = sum(weight_parts) - if abs(weight_sum - 1.0) > 0.01: - if not json_mode: - console.print(f"[yellow]Warning: Weights sum to {weight_sum:.2f}, should sum to 1.0. Normalizing...[/yellow]") - weight_parts = [w / weight_sum for w in weight_parts] - hybrid_weights = { - "splade": weight_parts[0], - "vector": weight_parts[1], - } - else: - if not json_mode: - console.print("[yellow]Warning: Invalid weights format. Using defaults.[/yellow]") - except ValueError as e: - if not json_mode: - console.print(f"[yellow]Warning: Invalid weights format ({e}). 
Using defaults.[/yellow]") - - registry: RegistryStore | None = None - try: - registry = RegistryStore() - registry.initialize() - mapper = PathMapper() - - engine = ChainSearchEngine(registry, mapper, config=config) - - # Map method to SearchOptions flags - # fts: FTS-only search (optionally with fuzzy) - # vector: Pure vector semantic search - # splade: SPLADE sparse neural search - # hybrid: RRF fusion of sparse + dense - # cascade: Two-stage binary + dense retrieval - if actual_method == "fts": - hybrid_mode = False - enable_fuzzy = use_fuzzy - enable_vector = False - pure_vector = False - enable_splade = False - enable_cascade = False - elif actual_method == "vector": - hybrid_mode = True - enable_fuzzy = False - enable_vector = True - pure_vector = True - enable_splade = False - enable_cascade = False - elif actual_method == "splade": - hybrid_mode = True - enable_fuzzy = False - enable_vector = False - pure_vector = False - enable_splade = True - enable_cascade = False - elif actual_method == "hybrid": - hybrid_mode = True - enable_fuzzy = use_fuzzy - enable_vector = True - pure_vector = False - enable_splade = True # SPLADE is preferred sparse in hybrid - enable_cascade = False - elif actual_method == "cascade": - hybrid_mode = True - enable_fuzzy = False - enable_vector = True - pure_vector = False - enable_splade = False - enable_cascade = True - else: - raise ValueError(f"Invalid method: {actual_method}") - - # Parse exclude_extensions from comma-separated string - exclude_exts_list = None - if exclude_extensions: - exclude_exts_list = [ext.strip() for ext in exclude_extensions.split(',') if ext.strip()] - - options = SearchOptions( - depth=depth, - total_limit=limit, - offset=offset, - files_only=files_only, - code_only=code_only, - exclude_extensions=exclude_exts_list, - hybrid_mode=hybrid_mode, - enable_fuzzy=enable_fuzzy, - enable_vector=enable_vector, - pure_vector=pure_vector, - enable_splade=enable_splade, - enable_cascade=enable_cascade, - 
hybrid_weights=hybrid_weights, - ) - - if files_only: - file_paths = engine.search_files_only(query, search_path, options) - payload = {"query": query, "count": len(file_paths), "files": file_paths} - if json_mode: - print_json(success=True, result=payload) - else: - for fp in file_paths: - console.print(fp) - else: - # Dispatch to cascade_search for cascade method - if actual_method == "cascade": - result = engine.cascade_search(query, search_path, k=limit, options=options, strategy=internal_cascade_strategy) - else: - result = engine.search(query, search_path, options) - results_list = [ - { - "path": r.path, - "score": r.score, - "excerpt": r.excerpt, - "content": r.content, # Full function/class body - "source": getattr(r, "search_source", None), - "symbol": getattr(r, "symbol", None), - } - for r in result.results - ] - - payload = { - "query": query, - "method": actual_method, - "count": len(results_list), - "results": results_list, - "stats": { - "dirs_searched": result.stats.dirs_searched, - "files_matched": result.stats.files_matched, - "time_ms": result.stats.time_ms, - }, - } - if json_mode: - print_json(success=True, result=payload) - else: - render_search_results(result.results, verbose=verbose) - console.print(f"[dim]Method: {actual_method} | Searched {result.stats.dirs_searched} directories in {result.stats.time_ms:.1f}ms[/dim]") - - except SearchError as exc: - if json_mode: - print_json(success=False, error=f"Search error: {exc}") - else: - console.print(f"[red]Search failed (query):[/red] {exc}") - raise typer.Exit(code=1) - except StorageError as exc: - if json_mode: - print_json(success=False, error=f"Storage error: {exc}") - else: - console.print(f"[red]Search failed (storage):[/red] {exc}") - raise typer.Exit(code=1) - except CodexLensError as exc: - if json_mode: - print_json(success=False, error=str(exc)) - else: - console.print(f"[red]Search failed:[/red] {exc}") - raise typer.Exit(code=1) - finally: - if registry is not None: - 
registry.close() - - -@app.command() -def symbol( - name: str = typer.Argument(..., help="Symbol name to look up."), - path: Path = typer.Option(Path("."), "--path", "-p", help="Directory to search from."), - kind: Optional[str] = typer.Option( - None, - "--kind", - "-k", - help="Filter by kind (function|class|method).", - ), - limit: int = typer.Option(50, "--limit", "-n", min=1, max=500, help="Max symbols."), - depth: int = typer.Option(-1, "--depth", "-d", help="Search depth (-1 = unlimited)."), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), - verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."), -) -> None: - """Look up symbols by name and optional kind.""" - _configure_logging(verbose, json_mode) - search_path = path.expanduser().resolve() - - registry: RegistryStore | None = None - try: - registry = RegistryStore() - registry.initialize() - mapper = PathMapper() - - engine = ChainSearchEngine(registry, mapper, config=config) - options = SearchOptions(depth=depth, total_limit=limit) - - syms = engine.search_symbols(name, search_path, kind=kind, options=options) - - payload = {"name": name, "kind": kind, "count": len(syms), "symbols": syms} - if json_mode: - print_json(success=True, result=payload) - else: - render_symbols(syms) - - except SearchError as exc: - if json_mode: - print_json(success=False, error=f"Search error: {exc}") - else: - console.print(f"[red]Symbol lookup failed (search):[/red] {exc}") - raise typer.Exit(code=1) - except StorageError as exc: - if json_mode: - print_json(success=False, error=f"Storage error: {exc}") - else: - console.print(f"[red]Symbol lookup failed (storage):[/red] {exc}") - raise typer.Exit(code=1) - except CodexLensError as exc: - if json_mode: - print_json(success=False, error=str(exc)) - else: - console.print(f"[red]Symbol lookup failed:[/red] {exc}") - raise typer.Exit(code=1) - finally: - if registry is not None: - registry.close() - - -@app.command() 
-def inspect( - file: Path = typer.Argument(..., exists=True, dir_okay=False, help="File to analyze."), - symbols: bool = typer.Option(True, "--symbols/--no-symbols", help="Show discovered symbols."), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), - verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."), -) -> None: - """Analyze a single file and display symbols.""" - _configure_logging(verbose, json_mode) - config = Config() - factory = ParserFactory(config) - - file_path = file.expanduser().resolve() - try: - text = file_path.read_text(encoding="utf-8", errors="ignore") - language_id = config.language_for_path(file_path) or "unknown" - parser = factory.get_parser(language_id) - indexed = parser.parse(text, file_path) - payload = {"file": indexed, "content_lines": len(text.splitlines())} - if json_mode: - print_json(success=True, result=payload) - else: - if symbols: - render_file_inspect(indexed.path, indexed.language, indexed.symbols) - else: - render_status({"file": indexed.path, "language": indexed.language}) - except ParseError as exc: - if json_mode: - print_json(success=False, error=f"Parse error: {exc}") - else: - console.print(f"[red]Inspect failed (parse):[/red] {exc}") - raise typer.Exit(code=1) - except FileNotFoundError as exc: - if json_mode: - print_json(success=False, error=f"File not found: {exc}") - else: - console.print(f"[red]Inspect failed (file not found):[/red] {exc}") - raise typer.Exit(code=1) - except PermissionError as exc: - if json_mode: - print_json(success=False, error=f"Permission denied: {exc}") - else: - console.print(f"[red]Inspect failed (permission denied):[/red] {exc}") - raise typer.Exit(code=1) - except CodexLensError as exc: - if json_mode: - print_json(success=False, error=str(exc)) - else: - console.print(f"[red]Inspect failed:[/red] {exc}") - raise typer.Exit(code=1) - - -@app.command() -def status( - json_mode: bool = typer.Option(False, "--json", 
help="Output JSON response."), - verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."), -) -> None: - """Show index status and configuration.""" - _configure_logging(verbose, json_mode) - - registry: RegistryStore | None = None - try: - registry = RegistryStore() - registry.initialize() - mapper = PathMapper() - - # Get all projects - projects = registry.list_projects() - - # Calculate total stats - total_files = sum(p.total_files for p in projects) - total_dirs = sum(p.total_dirs for p in projects) - - # Get index root size - index_root = mapper.index_root - index_size = 0 - if index_root.exists(): - for f in index_root.rglob("*"): - if f.is_file(): - index_size += f.stat().st_size - - # Check schema version and enabled features - schema_version = None - has_dual_fts = False - if projects and index_root.exists(): - # Check first index database for features - index_files = list(index_root.rglob("_index.db")) - if index_files: - try: - with DirIndexStore(index_files[0]) as store: - with store._lock: - conn = store._get_connection() - schema_version = store._get_schema_version(conn) - # Check if dual FTS tables exist - cursor = conn.execute( - "SELECT name FROM sqlite_master WHERE type='table' AND name IN ('files_fts_exact', 'files_fts_fuzzy')" - ) - fts_tables = [row[0] for row in cursor.fetchall()] - has_dual_fts = len(fts_tables) == 2 - except Exception: - pass - - # Check embeddings coverage - embeddings_info = None - has_vector_search = False - try: - from codexlens.cli.embedding_manager import get_embeddings_status - - if index_root.exists(): - embed_status = get_embeddings_status(index_root) - if embed_status["success"]: - embeddings_info = embed_status["result"] - # Enable vector search if coverage >= 50% - has_vector_search = embeddings_info["coverage_percent"] >= 50.0 - except ImportError: - # Embedding manager not available - pass - except Exception as e: - logging.debug(f"Failed to get embeddings status: {e}") - - stats = { 
- "index_root": str(index_root), - "registry_path": str(_get_registry_path()), - "projects_count": len(projects), - "total_files": total_files, - "total_dirs": total_dirs, - "index_size_bytes": index_size, - "index_size_mb": round(index_size / (1024 * 1024), 2), - "schema_version": schema_version, - "features": { - "exact_fts": True, # Always available - "fuzzy_fts": has_dual_fts, - "hybrid_search": has_dual_fts, - "vector_search": has_vector_search, - }, - } - - # Add embeddings info if available - if embeddings_info: - stats["embeddings"] = embeddings_info - - if json_mode: - print_json(success=True, result=stats) - else: - console.print("[bold]CodexLens Status[/bold]") - console.print(f" Index Root: {stats['index_root']}") - console.print(f" Registry: {stats['registry_path']}") - console.print(f" Projects: {stats['projects_count']}") - console.print(f" Total Files: {stats['total_files']}") - console.print(f" Total Directories: {stats['total_dirs']}") - console.print(f" Index Size: {stats['index_size_mb']} MB") - if schema_version: - console.print(f" Schema Version: {schema_version}") - console.print("\n[bold]Search Backends:[/bold]") - console.print(f" Exact FTS: ✓ (unicode61)") - if has_dual_fts: - console.print(f" Fuzzy FTS: ✓ (trigram)") - console.print(f" Hybrid Search: ✓ (RRF fusion)") - else: - console.print(f" Fuzzy FTS: ✗ (run 'migrate' to enable)") - console.print(f" Hybrid Search: ✗ (run 'migrate' to enable)") - - if has_vector_search: - console.print(f" Vector Search: ✓ (embeddings available)") - else: - console.print(f" Vector Search: ✗ (no embeddings or coverage < 50%)") - - # Display embeddings statistics if available - if embeddings_info: - console.print("\n[bold]Embeddings Coverage:[/bold]") - console.print(f" Total Indexes: {embeddings_info['total_indexes']}") - console.print(f" Total Files: {embeddings_info['total_files']}") - console.print(f" Files with Embeddings: {embeddings_info['files_with_embeddings']}") - console.print(f" Coverage: 
{embeddings_info['coverage_percent']:.1f}%") - console.print(f" Total Chunks: {embeddings_info['total_chunks']}") - - # Display model information if available - model_info = embeddings_info.get('model_info') - if model_info: - console.print("\n[bold]Embedding Model:[/bold]") - console.print(f" Backend: [cyan]{model_info.get('backend', 'unknown')}[/cyan]") - console.print(f" Model: [cyan]{model_info.get('model_profile', 'unknown')}[/cyan] ({model_info.get('model_name', '')})") - console.print(f" Dimensions: {model_info.get('embedding_dim', 'unknown')}") - if model_info.get('updated_at'): - console.print(f" Last Updated: {model_info['updated_at']}") - - except StorageError as exc: - if json_mode: - print_json(success=False, error=f"Storage error: {exc}") - else: - console.print(f"[red]Status failed (storage):[/red] {exc}") - raise typer.Exit(code=1) - except CodexLensError as exc: - if json_mode: - print_json(success=False, error=str(exc)) - else: - console.print(f"[red]Status failed:[/red] {exc}") - raise typer.Exit(code=1) - finally: - if registry is not None: - registry.close() - - -@app.command() -def projects( - action: str = typer.Argument("list", help="Action: list, show, remove"), - project_path: Optional[Path] = typer.Argument(None, help="Project path (for show/remove)."), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), - verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."), -) -> None: - """Manage registered projects in the global registry. 
- - Actions: - - list: Show all registered projects - - show : Show details for a specific project - - remove : Remove a project from the registry - """ - _configure_logging(verbose, json_mode) - - registry: RegistryStore | None = None - try: - registry = RegistryStore() - registry.initialize() - - if action == "list": - project_list = registry.list_projects() - if json_mode: - result = [ - { - "id": p.id, - "source_root": str(p.source_root), - "index_root": str(p.index_root), - "total_files": p.total_files, - "total_dirs": p.total_dirs, - "status": p.status, - } - for p in project_list - ] - print_json(success=True, result=result) - else: - if not project_list: - console.print("[yellow]No projects registered.[/yellow]") - else: - table = Table(title="Registered Projects") - table.add_column("ID", style="dim") - table.add_column("Source Root") - table.add_column("Files", justify="right") - table.add_column("Dirs", justify="right") - table.add_column("Status") - - for p in project_list: - table.add_row( - str(p.id), - str(p.source_root), - str(p.total_files), - str(p.total_dirs), - p.status, - ) - console.print(table) - - elif action == "show": - if not project_path: - raise typer.BadParameter("Project path required for 'show' action") - - project_path = project_path.expanduser().resolve() - project_info = registry.get_project(project_path) - - if not project_info: - if json_mode: - print_json(success=False, error=f"Project not found: {project_path}") - else: - console.print(f"[red]Project not found:[/red] {project_path}") - raise typer.Exit(code=1) - - if json_mode: - result = { - "id": project_info.id, - "source_root": str(project_info.source_root), - "index_root": str(project_info.index_root), - "total_files": project_info.total_files, - "total_dirs": project_info.total_dirs, - "status": project_info.status, - "created_at": project_info.created_at, - "last_indexed": project_info.last_indexed, - } - print_json(success=True, result=result) - else: - 
console.print(f"[bold]Project:[/bold] {project_info.source_root}") - console.print(f" ID: {project_info.id}") - console.print(f" Index Root: {project_info.index_root}") - console.print(f" Files: {project_info.total_files}") - console.print(f" Directories: {project_info.total_dirs}") - console.print(f" Status: {project_info.status}") - - # Show directory breakdown - dirs = registry.get_project_dirs(project_info.id) - if dirs: - console.print(f"\n [bold]Indexed Directories:[/bold] {len(dirs)}") - for d in dirs[:10]: - console.print(f" - {d.source_path.name}/ ({d.files_count} files)") - if len(dirs) > 10: - console.print(f" ... and {len(dirs) - 10} more") - - elif action == "remove": - if not project_path: - raise typer.BadParameter("Project path required for 'remove' action") - - project_path = project_path.expanduser().resolve() - removed = registry.unregister_project(project_path) - - if removed: - mapper = PathMapper() - index_root = mapper.source_to_index_dir(project_path) - if index_root.exists(): - shutil.rmtree(index_root) - - if json_mode: - print_json(success=True, result={"removed": str(project_path)}) - else: - console.print(f"[green]Removed:[/green] {project_path}") - else: - if json_mode: - print_json(success=False, error=f"Project not found: {project_path}") - else: - console.print(f"[yellow]Project not found:[/yellow] {project_path}") - - else: - raise typer.BadParameter(f"Unknown action: {action}. 
Use list, show, or remove.") - - except typer.BadParameter: - raise - except StorageError as exc: - if json_mode: - print_json(success=False, error=f"Storage error: {exc}") - else: - console.print(f"[red]Projects command failed (storage):[/red] {exc}") - raise typer.Exit(code=1) - except PermissionError as exc: - if json_mode: - print_json(success=False, error=f"Permission denied: {exc}") - else: - console.print(f"[red]Projects command failed (permission denied):[/red] {exc}") - raise typer.Exit(code=1) - except CodexLensError as exc: - if json_mode: - print_json(success=False, error=str(exc)) - else: - console.print(f"[red]Projects command failed:[/red] {exc}") - raise typer.Exit(code=1) - finally: - if registry is not None: - registry.close() - - -@app.command() -def config( - action: str = typer.Argument("show", help="Action: show, set, migrate"), - key: Optional[str] = typer.Argument(None, help="Config key (for set action)."), - value: Optional[str] = typer.Argument(None, help="Config value (for set action)."), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), - verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."), -) -> None: - """Manage CodexLens configuration. 
- - Actions: - - show: Display current configuration - - set : Set configuration value - - migrate : Migrate indexes to new location - - Config keys: - - index_dir: Directory to store indexes (default: ~/.codexlens/indexes) - - reranker_backend: Reranker backend (onnx, api, litellm, legacy) - - reranker_model: Reranker model name - - reranker_enabled: Enable reranking (true/false) - - reranker_top_k: Number of results to rerank - - reranker_api_provider: API provider for reranker (siliconflow, cohere, jina) - - embedding_backend: Embedding backend (fastembed, litellm) - - embedding_model: Embedding model profile or name - """ - _configure_logging(verbose, json_mode) - - config_file = Path.home() / ".codexlens" / "config.json" - - def load_config() -> Dict[str, Any]: - if config_file.exists(): - return json.loads(config_file.read_text(encoding="utf-8")) - return {} - - def save_config(cfg: Dict[str, Any]) -> None: - config_file.parent.mkdir(parents=True, exist_ok=True) - config_file.write_text(json.dumps(cfg, indent=2), encoding="utf-8") - - try: - if action == "show": - cfg = load_config() - current_index_dir = os.getenv("CODEXLENS_INDEX_DIR") or cfg.get("index_dir") or str(Path.home() / ".codexlens" / "indexes") - - result = { - "config_file": str(config_file), - "index_dir": current_index_dir, - "env_override": os.getenv("CODEXLENS_INDEX_DIR"), - } - - # Load settings.json for reranker and other runtime settings - settings_file = Path.home() / ".codexlens" / "settings.json" - if settings_file.exists(): - try: - settings = json.loads(settings_file.read_text(encoding="utf-8")) - # Extract reranker settings (flat keys for CCW compatibility) - reranker = settings.get("reranker", {}) - if reranker.get("backend"): - result["reranker_backend"] = reranker["backend"] - if reranker.get("model"): - result["reranker_model"] = reranker["model"] - if reranker.get("enabled") is not None: - result["reranker_enabled"] = reranker["enabled"] - if reranker.get("top_k"): - 
result["reranker_top_k"] = reranker["top_k"] - if reranker.get("api_provider"): - result["reranker_api_provider"] = reranker["api_provider"] - # Extract embedding settings - embedding = settings.get("embedding", {}) - if embedding.get("backend"): - result["embedding_backend"] = embedding["backend"] - if embedding.get("model"): - result["embedding_model"] = embedding["model"] - except (json.JSONDecodeError, OSError): - pass # Settings file not readable, continue with defaults - - # Load .env overrides from global ~/.codexlens/.env - env_overrides: Dict[str, str] = {} - try: - from codexlens.env_config import load_global_env - env_overrides = load_global_env() - except ImportError: - pass - - # Apply .env overrides (highest priority) and track them - if env_overrides.get("EMBEDDING_MODEL"): - result["embedding_model"] = env_overrides["EMBEDDING_MODEL"] - result["embedding_model_source"] = ".env" - if env_overrides.get("EMBEDDING_BACKEND"): - result["embedding_backend"] = env_overrides["EMBEDDING_BACKEND"] - result["embedding_backend_source"] = ".env" - if env_overrides.get("RERANKER_MODEL"): - result["reranker_model"] = env_overrides["RERANKER_MODEL"] - result["reranker_model_source"] = ".env" - if env_overrides.get("RERANKER_BACKEND"): - result["reranker_backend"] = env_overrides["RERANKER_BACKEND"] - result["reranker_backend_source"] = ".env" - if env_overrides.get("RERANKER_ENABLED"): - result["reranker_enabled"] = env_overrides["RERANKER_ENABLED"].lower() in ("true", "1", "yes", "on") - result["reranker_enabled_source"] = ".env" - if env_overrides.get("RERANKER_PROVIDER") or os.getenv("RERANKER_PROVIDER"): - result["reranker_api_provider"] = env_overrides.get("RERANKER_PROVIDER") or os.getenv("RERANKER_PROVIDER") - - if json_mode: - print_json(success=True, result=result) - else: - console.print("[bold]CodexLens Configuration[/bold]") - console.print(f" Config File: {result['config_file']}") - console.print(f" Index Directory: {result['index_dir']}") - if 
result['env_override']: - console.print(f" [dim](Override via CODEXLENS_INDEX_DIR)[/dim]") - - # Show embedding settings - console.print(f"\n[bold]Embedding[/bold]") - backend = result.get('embedding_backend', 'fastembed') - backend_source = result.get('embedding_backend_source', 'settings.json') - console.print(f" Backend: {backend} [dim]({backend_source})[/dim]") - model = result.get('embedding_model', 'code') - model_source = result.get('embedding_model_source', 'settings.json') - console.print(f" Model: {model} [dim]({model_source})[/dim]") - - # Show reranker settings - console.print(f"\n[bold]Reranker[/bold]") - backend = result.get('reranker_backend', 'fastembed') - backend_source = result.get('reranker_backend_source', 'settings.json') - console.print(f" Backend: {backend} [dim]({backend_source})[/dim]") - model = result.get('reranker_model', 'N/A') - model_source = result.get('reranker_model_source', 'settings.json') - console.print(f" Model: {model} [dim]({model_source})[/dim]") - enabled = result.get('reranker_enabled', False) - enabled_source = result.get('reranker_enabled_source', 'settings.json') - console.print(f" Enabled: {enabled} [dim]({enabled_source})[/dim]") - - elif action == "set": - if not key: - raise typer.BadParameter("Config key required for 'set' action") - if not value: - raise typer.BadParameter("Config value required for 'set' action") - - cfg = load_config() - - if key == "index_dir": - new_path = Path(value).expanduser().resolve() - cfg["index_dir"] = str(new_path) - save_config(cfg) - - if json_mode: - print_json(success=True, result={"key": key, "value": str(new_path)}) - else: - console.print(f"[green]Set {key}=[/green] {new_path}") - console.print("[yellow]Note: Existing indexes remain at old location. 
Use 'config migrate' to move them.[/yellow]") - - # Handle reranker and embedding settings (stored in settings.json) - elif key in ("reranker_backend", "reranker_model", "reranker_enabled", "reranker_top_k", - "embedding_backend", "embedding_model", "reranker_api_provider"): - settings_file = Path.home() / ".codexlens" / "settings.json" - settings_file.parent.mkdir(parents=True, exist_ok=True) - - # Load existing settings - settings: Dict[str, Any] = {} - if settings_file.exists(): - try: - settings = json.loads(settings_file.read_text(encoding="utf-8")) - except (json.JSONDecodeError, OSError): - pass - - # Ensure nested structures exist - if "reranker" not in settings: - settings["reranker"] = {} - if "embedding" not in settings: - settings["embedding"] = {} - - # Map flat keys to nested structure - if key == "reranker_backend": - settings["reranker"]["backend"] = value - elif key == "reranker_model": - settings["reranker"]["model"] = value - elif key == "reranker_enabled": - settings["reranker"]["enabled"] = value.lower() in ("true", "1", "yes") - elif key == "reranker_top_k": - settings["reranker"]["top_k"] = int(value) - elif key == "reranker_api_provider": - settings["reranker"]["api_provider"] = value - elif key == "embedding_backend": - settings["embedding"]["backend"] = value - elif key == "embedding_model": - settings["embedding"]["model"] = value - - # Save settings - settings_file.write_text(json.dumps(settings, indent=2), encoding="utf-8") - - if json_mode: - print_json(success=True, result={"key": key, "value": value}) - else: - console.print(f"[green]Set {key}=[/green] {value}") - else: - raise typer.BadParameter(f"Unknown config key: {key}") - - elif action == "migrate": - if not key: - raise typer.BadParameter("New path required for 'migrate' action") - - new_path = Path(key).expanduser().resolve() - mapper = PathMapper() - old_path = mapper.index_root - - if not old_path.exists(): - if json_mode: - print_json(success=False, error="No indexes to 
migrate") - else: - console.print("[yellow]No indexes to migrate.[/yellow]") - return - - # Create new directory - new_path.mkdir(parents=True, exist_ok=True) - - # Count items to migrate - items = list(old_path.iterdir()) - migrated = 0 - - with Progress( - SpinnerColumn(), - TextColumn("[progress.description]{task.description}"), - BarColumn(), - TextColumn("{task.completed}/{task.total}"), - TimeElapsedColumn(), - console=console, - ) as progress: - task = progress.add_task("Migrating indexes", total=len(items)) - - for item in items: - dest = new_path / item.name - if item.is_dir(): - shutil.copytree(item, dest, dirs_exist_ok=True) - else: - shutil.copy2(item, dest) - migrated += 1 - progress.advance(task) - - # Update config - cfg = load_config() - cfg["index_dir"] = str(new_path) - save_config(cfg) - - # Update registry paths - registry = RegistryStore() - registry.initialize() - registry.update_index_paths(old_path, new_path) - registry.close() - - result = { - "migrated_from": str(old_path), - "migrated_to": str(new_path), - "items_migrated": migrated, - } - - if json_mode: - print_json(success=True, result=result) - else: - console.print(f"[green]Migrated {migrated} items to:[/green] {new_path}") - console.print("[dim]Old indexes can be manually deleted after verifying migration.[/dim]") - - else: - raise typer.BadParameter(f"Unknown action: {action}. 
Use show, set, or migrate.") - - except typer.BadParameter: - raise - except ConfigError as exc: - if json_mode: - print_json(success=False, error=f"Configuration error: {exc}") - else: - console.print(f"[red]Config command failed (config):[/red] {exc}") - raise typer.Exit(code=1) - except StorageError as exc: - if json_mode: - print_json(success=False, error=f"Storage error: {exc}") - else: - console.print(f"[red]Config command failed (storage):[/red] {exc}") - raise typer.Exit(code=1) - except PermissionError as exc: - if json_mode: - print_json(success=False, error=f"Permission denied: {exc}") - else: - console.print(f"[red]Config command failed (permission denied):[/red] {exc}") - raise typer.Exit(code=1) - except CodexLensError as exc: - if json_mode: - print_json(success=False, error=str(exc)) - else: - console.print(f"[red]Config command failed:[/red] {exc}") - raise typer.Exit(code=1) - - -@app.command() -def migrate( - path: Path = typer.Argument(Path("."), exists=True, file_okay=False, dir_okay=True, help="Project root to migrate."), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), - verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."), -) -> None: - """Migrate project indexes to latest schema (Dual-FTS upgrade). - - Upgrades all _index.db files in the project to schema version 4, which includes: - - Dual FTS tables (exact + fuzzy) - - Encoding detection support - - Incremental indexing metadata - - This is a safe operation that preserves all existing data. - Progress is shown during migration. - """ - _configure_logging(verbose, json_mode) - base_path = path.expanduser().resolve() - - registry: RegistryStore | None = None - try: - registry = RegistryStore() - registry.initialize() - mapper = PathMapper() - - # Find project - project_info = registry.get_project(base_path) - if not project_info: - raise CodexLensError(f"No index found for: {base_path}. 
Run 'codex-lens init' first.") - - index_dir = mapper.source_to_index_dir(base_path) - if not index_dir.exists(): - raise CodexLensError(f"Index directory not found: {index_dir}") - - # Find all _index.db files - index_files = list(index_dir.rglob("_index.db")) - - if not index_files: - if json_mode: - print_json(success=True, result={"message": "No indexes to migrate", "migrated": 0}) - else: - console.print("[yellow]No indexes found to migrate.[/yellow]") - return - - migrated_count = 0 - error_count = 0 - already_migrated = 0 - - with Progress( - SpinnerColumn(), - TextColumn("[progress.description]{task.description}"), - BarColumn(), - TextColumn("[progress.percentage]{task.percentage:>3.0f}%"), - TextColumn("({task.completed}/{task.total})"), - TimeElapsedColumn(), - console=console, - ) as progress: - task = progress.add_task(f"Migrating {len(index_files)} indexes...", total=len(index_files)) - - for db_path in index_files: - try: - store = DirIndexStore(db_path) - - # Check current version - with store._lock: - conn = store._get_connection() - current_version = store._get_schema_version(conn) - - if current_version >= DirIndexStore.SCHEMA_VERSION: - already_migrated += 1 - if verbose: - progress.console.print(f"[dim]Already migrated: {db_path.parent.name}[/dim]") - elif current_version > 0: - # Apply migrations - store._apply_migrations(conn, current_version) - store._set_schema_version(conn, DirIndexStore.SCHEMA_VERSION) - conn.commit() - migrated_count += 1 - if verbose: - progress.console.print(f"[green]Migrated: {db_path.parent.name} (v{current_version} → v{DirIndexStore.SCHEMA_VERSION})[/green]") - else: - # New database, initialize directly - store.initialize() - migrated_count += 1 - - store.close() - - except Exception as e: - error_count += 1 - if verbose: - progress.console.print(f"[red]Error migrating {db_path}: {e}[/red]") - - progress.update(task, advance=1) - - result = { - "path": str(base_path), - "total_indexes": len(index_files), - 
"migrated": migrated_count, - "already_migrated": already_migrated, - "errors": error_count, - } - - if json_mode: - print_json(success=True, result=result) - else: - console.print(f"[green]Migration complete:[/green]") - console.print(f" Total indexes: {len(index_files)}") - console.print(f" Migrated: {migrated_count}") - console.print(f" Already up-to-date: {already_migrated}") - if error_count > 0: - console.print(f" [yellow]Errors: {error_count}[/yellow]") - - except StorageError as exc: - if json_mode: - print_json(success=False, error=f"Storage error: {exc}") - else: - console.print(f"[red]Migration failed (storage):[/red] {exc}") - raise typer.Exit(code=1) - except CodexLensError as exc: - if json_mode: - print_json(success=False, error=str(exc)) - else: - console.print(f"[red]Migration failed:[/red] {exc}") - raise typer.Exit(code=1) - finally: - if registry is not None: - registry.close() - - -@app.command() -def clean( - path: Optional[Path] = typer.Argument(None, help="Project path to clean (removes project index)."), - all_indexes: bool = typer.Option(False, "--all", "-a", help="Remove all indexes."), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), - verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."), -) -> None: - """Remove CodexLens index data. - - Without arguments, shows current index size. - With path, removes that project's indexes. - With --all, removes all indexes (use with caution). 
- """ - _configure_logging(verbose, json_mode) - - try: - mapper = PathMapper() - index_root = mapper.index_root - - if all_indexes: - # Remove everything - if not index_root.exists(): - if json_mode: - print_json(success=True, result={"cleaned": None, "message": "No indexes to clean"}) - else: - console.print("[yellow]No indexes to clean.[/yellow]") - return - - # Calculate size before removal - total_size = 0 - for f in index_root.rglob("*"): - if f.is_file(): - total_size += f.stat().st_size - - # Remove registry first - registry_path = _get_registry_path() - if registry_path.exists(): - registry_path.unlink() - - # Remove all indexes - shutil.rmtree(index_root) - - result = { - "cleaned": str(index_root), - "size_freed_mb": round(total_size / (1024 * 1024), 2), - } - - if json_mode: - print_json(success=True, result=result) - else: - console.print(f"[green]Removed all indexes:[/green] {result['size_freed_mb']} MB freed") - - elif path: - # Remove specific project - project_path = path.expanduser().resolve() - project_index = mapper.source_to_index_dir(project_path) - - if not project_index.exists(): - if json_mode: - print_json(success=False, error=f"No index found for: {project_path}") - else: - console.print(f"[yellow]No index found for:[/yellow] {project_path}") - return - - # Calculate size - total_size = 0 - for f in project_index.rglob("*"): - if f.is_file(): - total_size += f.stat().st_size - - # Remove from registry - registry = RegistryStore() - registry.initialize() - registry.unregister_project(project_path) - registry.close() - - # Remove indexes - shutil.rmtree(project_index) - - result = { - "cleaned": str(project_path), - "index_path": str(project_index), - "size_freed_mb": round(total_size / (1024 * 1024), 2), - } - - if json_mode: - print_json(success=True, result=result) - else: - console.print(f"[green]Removed indexes for:[/green] {project_path}") - console.print(f" Freed: {result['size_freed_mb']} MB") - - else: - # Show current status - if 
not index_root.exists(): - if json_mode: - print_json(success=True, result={"index_root": str(index_root), "exists": False}) - else: - console.print("[yellow]No indexes found.[/yellow]") - return - - total_size = 0 - for f in index_root.rglob("*"): - if f.is_file(): - total_size += f.stat().st_size - - registry = RegistryStore() - registry.initialize() - projects = registry.list_projects() - registry.close() - - result = { - "index_root": str(index_root), - "projects_count": len(projects), - "total_size_mb": round(total_size / (1024 * 1024), 2), - } - - if json_mode: - print_json(success=True, result=result) - else: - console.print("[bold]Index Status[/bold]") - console.print(f" Location: {result['index_root']}") - console.print(f" Projects: {result['projects_count']}") - console.print(f" Total Size: {result['total_size_mb']} MB") - console.print("\n[dim]Use 'clean ' to remove a specific project or 'clean --all' to remove everything.[/dim]") - - except StorageError as exc: - if json_mode: - print_json(success=False, error=f"Storage error: {exc}") - else: - console.print(f"[red]Clean failed (storage):[/red] {exc}") - raise typer.Exit(code=1) - except PermissionError as exc: - if json_mode: - print_json(success=False, error=f"Permission denied: {exc}") - else: - console.print(f"[red]Clean failed (permission denied):[/red] {exc}") - raise typer.Exit(code=1) - except CodexLensError as exc: - if json_mode: - print_json(success=False, error=str(exc)) - else: - console.print(f"[red]Clean failed:[/red] {exc}") - raise typer.Exit(code=1) - - -@app.command("semantic-list") -def semantic_list( - path: Path = typer.Option(Path("."), "--path", "-p", help="Project path to list metadata from."), - offset: int = typer.Option(0, "--offset", "-o", min=0, help="Number of records to skip."), - limit: int = typer.Option(50, "--limit", "-n", min=1, max=100, help="Maximum records to return."), - tool_filter: Optional[str] = typer.Option(None, "--tool", "-t", help="Filter by LLM tool 
(gemini/qwen)."), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), - verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."), -) -> None: - """List semantic metadata entries for indexed files. - - Shows files that have LLM-generated summaries and keywords. - Results are aggregated from all index databases in the project. - """ - _configure_logging(verbose, json_mode) - base_path = path.expanduser().resolve() - - registry: Optional[RegistryStore] = None - try: - registry = RegistryStore() - registry.initialize() - mapper = PathMapper() - - project_info = registry.get_project(base_path) - if not project_info: - raise CodexLensError(f"No index found for: {base_path}. Run 'codex-lens init' first.") - - index_dir = Path(project_info.index_root) - if not index_dir.exists(): - raise CodexLensError(f"Index directory not found: {index_dir}") - - all_results: list = [] - total_count = 0 - - index_files = sorted(index_dir.rglob("_index.db")) - - for db_path in index_files: - try: - store = DirIndexStore(db_path) - store.initialize() - - results, count = store.list_semantic_metadata( - offset=0, - limit=1000, - llm_tool=tool_filter, - ) - - source_dir = mapper.index_to_source(db_path.parent) - for r in results: - r["source_dir"] = str(source_dir) - - all_results.extend(results) - total_count += count - - store.close() - except Exception as e: - if verbose: - console.print(f"[yellow]Warning: Error reading {db_path}: {e}[/yellow]") - - all_results.sort(key=lambda x: x["generated_at"], reverse=True) - paginated = all_results[offset : offset + limit] - - result = { - "path": str(base_path), - "total": total_count, - "offset": offset, - "limit": limit, - "count": len(paginated), - "entries": paginated, - } - - if json_mode: - print_json(success=True, result=result) - else: - if not paginated: - console.print("[yellow]No semantic metadata found.[/yellow]") - console.print("Run 'codex-lens enhance' to generate metadata for 
indexed files.") - else: - table = Table(title=f"Semantic Metadata ({total_count} total)") - table.add_column("File", style="cyan", max_width=40) - table.add_column("Language", style="dim") - table.add_column("Purpose", max_width=30) - table.add_column("Keywords", max_width=25) - table.add_column("Tool") - - for entry in paginated: - keywords_str = ", ".join(entry["keywords"][:3]) - if len(entry["keywords"]) > 3: - keywords_str += f" (+{len(entry['keywords']) - 3})" - - table.add_row( - entry["file_name"], - entry["language"] or "-", - (entry["purpose"] or "-")[:30], - keywords_str or "-", - entry["llm_tool"] or "-", - ) - - console.print(table) - - if total_count > len(paginated): - console.print( - f"[dim]Showing {offset + 1}-{offset + len(paginated)} of {total_count}. " - "Use --offset and --limit for pagination.[/dim]" - ) - - except StorageError as exc: - if json_mode: - print_json(success=False, error=f"Storage error: {exc}") - else: - console.print(f"[red]Semantic-list failed (storage):[/red] {exc}") - raise typer.Exit(code=1) - except CodexLensError as exc: - if json_mode: - print_json(success=False, error=str(exc)) - else: - console.print(f"[red]Semantic-list failed:[/red] {exc}") - raise typer.Exit(code=1) - finally: - if registry is not None: - registry.close() - - -# ==================== Model Management Commands ==================== - -@app.command(name="model-list") -def model_list( - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), -) -> None: - """List available embedding models and their installation status. 
- - Shows 4 model profiles (fast, code, multilingual, balanced) with: - - Installation status - - Model size and dimensions - - Use case recommendations - """ - try: - from codexlens.cli.model_manager import list_models - - result = list_models() - - if json_mode: - print_json(**result) - else: - if not result["success"]: - console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}") - raise typer.Exit(code=1) - - data = result["result"] - models = data["models"] - cache_dir = data["cache_dir"] - cache_exists = data["cache_exists"] - - console.print("[bold]Available Embedding Models:[/bold]") - console.print(f"Cache directory: [dim]{cache_dir}[/dim] {'(exists)' if cache_exists else '(not found)'}\n") - - table = Table(show_header=True, header_style="bold") - table.add_column("Profile", style="cyan") - table.add_column("Model Name", style="blue") - table.add_column("Dims", justify="right") - table.add_column("Size (MB)", justify="right") - table.add_column("Status", justify="center") - table.add_column("Use Case", style="dim") - - for model in models: - status_icon = "[green]✓[/green]" if model["installed"] else "[dim]—[/dim]" - size_display = ( - f"{model['actual_size_mb']:.1f}" if model["installed"] - else f"~{model['estimated_size_mb']}" - ) - table.add_row( - model["profile"], - model["model_name"], - str(model["dimensions"]), - size_display, - status_icon, - model["use_case"][:40] + "..." if len(model["use_case"]) > 40 else model["use_case"], - ) - - console.print(table) - console.print("\n[dim]Use 'codexlens model-download ' to download a model[/dim]") - - except ImportError: - if json_mode: - print_json(success=False, error="fastembed not installed. 
Install with: pip install codexlens[semantic]") - else: - console.print("[red]Error:[/red] fastembed not installed") - console.print("[yellow]Install with:[/yellow] pip install codexlens[semantic]") - raise typer.Exit(code=1) - - -@app.command(name="model-download") -def model_download( - profile: str = typer.Argument(..., help="Model profile to download (fast, code, multilingual, balanced)."), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), -) -> None: - """Download an embedding model by profile name. - - Example: - codexlens model-download code # Download code-optimized model - """ - try: - from codexlens.cli.model_manager import download_model - - if not json_mode: - console.print(f"[bold]Downloading model:[/bold] {profile}") - console.print("[dim]This may take a few minutes depending on your internet connection...[/dim]\n") - - # Create progress callback for non-JSON mode - progress_callback = None if json_mode else lambda msg: console.print(f"[cyan]{msg}[/cyan]") - - result = download_model(profile, progress_callback=progress_callback) - - if json_mode: - print_json(**result) - else: - if not result["success"]: - console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}") - raise typer.Exit(code=1) - - data = result["result"] - console.print(f"[green]✓[/green] Model downloaded successfully!") - console.print(f" Profile: {data['profile']}") - console.print(f" Model: {data['model_name']}") - console.print(f" Cache size: {data['cache_size_mb']:.1f} MB") - console.print(f" Location: [dim]{data['cache_path']}[/dim]") - - except ImportError: - if json_mode: - print_json(success=False, error="fastembed not installed. 
Install with: pip install codexlens[semantic]") - else: - console.print("[red]Error:[/red] fastembed not installed") - console.print("[yellow]Install with:[/yellow] pip install codexlens[semantic]") - raise typer.Exit(code=1) - - -@app.command(name="model-delete") -def model_delete( - profile: str = typer.Argument(..., help="Model profile to delete (fast, code, multilingual, balanced)."), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), -) -> None: - """Delete a downloaded embedding model from cache. - - Example: - codexlens model-delete fast # Delete fast model - """ - from codexlens.cli.model_manager import delete_model - - if not json_mode: - console.print(f"[bold yellow]Deleting model:[/bold yellow] {profile}") - - result = delete_model(profile) - - if json_mode: - print_json(**result) - else: - if not result["success"]: - console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}") - raise typer.Exit(code=1) - - data = result["result"] - console.print(f"[green]✓[/green] Model deleted successfully!") - console.print(f" Profile: {data['profile']}") - console.print(f" Model: {data['model_name']}") - console.print(f" Freed space: {data['deleted_size_mb']:.1f} MB") - - -@app.command(name="model-download-custom") -def model_download_custom( - model_name: str = typer.Argument(..., help="Full HuggingFace model name (e.g., BAAI/bge-small-en-v1.5)."), - model_type: str = typer.Option("embedding", "--type", help="Model type: embedding or reranker."), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), -) -> None: - """Download a custom HuggingFace model by name. - - This allows downloading any fastembed-compatible model from HuggingFace. 
- - Example: - codexlens model-download-custom BAAI/bge-small-en-v1.5 - codexlens model-download-custom BAAI/bge-reranker-base --type reranker - """ - try: - from codexlens.cli.model_manager import download_custom_model - - if not json_mode: - console.print(f"[bold]Downloading custom model:[/bold] {model_name}") - console.print(f"[dim]Model type: {model_type}[/dim]") - console.print("[dim]This may take a few minutes depending on your internet connection...[/dim]\n") - - progress_callback = None if json_mode else lambda msg: console.print(f"[cyan]{msg}[/cyan]") - - result = download_custom_model(model_name, model_type=model_type, progress_callback=progress_callback) - - if json_mode: - print_json(**result) - else: - if not result["success"]: - console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}") - raise typer.Exit(code=1) - - data = result["result"] - console.print(f"[green]✓[/green] Custom model downloaded successfully!") - console.print(f" Model: {data['model_name']}") - console.print(f" Type: {data['model_type']}") - console.print(f" Cache size: {data['cache_size_mb']:.1f} MB") - console.print(f" Location: [dim]{data['cache_path']}[/dim]") - - except ImportError: - if json_mode: - print_json(success=False, error="fastembed not installed. Install with: pip install codexlens[semantic]") - else: - console.print("[red]Error:[/red] fastembed not installed") - console.print("[yellow]Install with:[/yellow] pip install codexlens[semantic]") - raise typer.Exit(code=1) - - -@app.command(name="model-info") -def model_info( - profile: str = typer.Argument(..., help="Model profile to get info (fast, code, multilingual, balanced)."), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), -) -> None: - """Get detailed information about a model profile. 
- - Example: - codexlens model-info code # Get code model details - """ - from codexlens.cli.model_manager import get_model_info - - result = get_model_info(profile) - - if json_mode: - print_json(**result) - else: - if not result["success"]: - console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}") - raise typer.Exit(code=1) - - data = result["result"] - console.print(f"[bold]Model Profile:[/bold] {data['profile']}") - console.print(f" Model name: {data['model_name']}") - console.print(f" Dimensions: {data['dimensions']}") - console.print(f" Status: {'[green]Installed[/green]' if data['installed'] else '[dim]Not installed[/dim]'}") - if data['installed'] and data['actual_size_mb']: - console.print(f" Cache size: {data['actual_size_mb']:.1f} MB") - console.print(f" Location: [dim]{data['cache_path']}[/dim]") - else: - console.print(f" Estimated size: ~{data['estimated_size_mb']} MB") - console.print(f"\n Description: {data['description']}") - console.print(f" Use case: {data['use_case']}") - - -# ==================== Reranker Model Management Commands ==================== - - -@app.command(name="reranker-model-list") -def reranker_model_list( - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), -) -> None: - """List available reranker models and their installation status. 
- - Shows reranker model profiles with: - - Installation status - - Model size - - Use case recommendations - """ - try: - from codexlens.cli.model_manager import list_reranker_models - - result = list_reranker_models() - - if json_mode: - print_json(**result) - else: - if not result["success"]: - console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}") - raise typer.Exit(code=1) - - data = result["result"] - models = data["models"] - cache_dir = data["cache_dir"] - cache_exists = data["cache_exists"] - - console.print("[bold]Available Reranker Models:[/bold]") - console.print(f"Cache directory: [dim]{cache_dir}[/dim] {'(exists)' if cache_exists else '(not found)'}\n") - - table = Table(show_header=True, header_style="bold") - table.add_column("Profile", style="cyan") - table.add_column("Model", style="dim") - table.add_column("Size", justify="right") - table.add_column("Status") - table.add_column("Description") - - for m in models: - status = "[green]✓ Installed[/green]" if m["installed"] else "[dim]Not installed[/dim]" - size = f"{m['actual_size_mb']:.1f} MB" if m["installed"] and m["actual_size_mb"] else f"~{m['estimated_size_mb']} MB" - rec = " [yellow]★[/yellow]" if m.get("recommended") else "" - table.add_row(m["profile"] + rec, m["model_name"], size, status, m["description"]) - - console.print(table) - console.print("\n[yellow]★[/yellow] = Recommended") - - except ImportError: - if json_mode: - print_json(success=False, error="fastembed reranker not available. 
Install with: pip install fastembed>=0.4.0") - else: - console.print("[red]Error:[/red] fastembed reranker not available") - console.print("Install with: [cyan]pip install fastembed>=0.4.0[/cyan]") - raise typer.Exit(code=1) - - -@app.command(name="reranker-model-download") -def reranker_model_download( - profile: str = typer.Argument(..., help="Reranker model profile to download."), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), -) -> None: - """Download a reranker model by profile name. - - Example: - codexlens reranker-model-download ms-marco-mini # Download default reranker - """ - try: - from codexlens.cli.model_manager import download_reranker_model - - if not json_mode: - console.print(f"[bold]Downloading reranker model:[/bold] {profile}") - console.print("[dim]This may take a few minutes depending on your internet connection...[/dim]\n") - - progress_callback = None if json_mode else lambda msg: console.print(f"[cyan]{msg}[/cyan]") - - result = download_reranker_model(profile, progress_callback=progress_callback) - - if json_mode: - print_json(**result) - else: - if not result["success"]: - console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}") - raise typer.Exit(code=1) - - data = result["result"] - console.print(f"[green]✓[/green] Reranker model downloaded successfully!") - console.print(f" Profile: {data['profile']}") - console.print(f" Model: {data['model_name']}") - console.print(f" Cache size: {data['cache_size_mb']:.1f} MB") - console.print(f" Location: [dim]{data['cache_path']}[/dim]") - - except ImportError: - if json_mode: - print_json(success=False, error="fastembed reranker not available. 
Install with: pip install fastembed>=0.4.0") - else: - console.print("[red]Error:[/red] fastembed reranker not available") - console.print("Install with: [cyan]pip install fastembed>=0.4.0[/cyan]") - raise typer.Exit(code=1) - - -@app.command(name="reranker-model-delete") -def reranker_model_delete( - profile: str = typer.Argument(..., help="Reranker model profile to delete."), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), -) -> None: - """Delete a downloaded reranker model from cache. - - Example: - codexlens reranker-model-delete ms-marco-mini # Delete reranker model - """ - from codexlens.cli.model_manager import delete_reranker_model - - if not json_mode: - console.print(f"[bold yellow]Deleting reranker model:[/bold yellow] {profile}") - - result = delete_reranker_model(profile) - - if json_mode: - print_json(**result) - else: - if not result["success"]: - console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}") - raise typer.Exit(code=1) - - data = result["result"] - console.print(f"[green]✓[/green] Reranker model deleted successfully!") - console.print(f" Profile: {data['profile']}") - console.print(f" Model: {data['model_name']}") - console.print(f" Freed space: {data['deleted_size_mb']:.1f} MB") - - -@app.command(name="reranker-model-info") -def reranker_model_info( - profile: str = typer.Argument(..., help="Reranker model profile to get info."), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), -) -> None: - """Get detailed information about a reranker model profile. 
- - Example: - codexlens reranker-model-info ms-marco-mini # Get reranker model details - """ - from codexlens.cli.model_manager import get_reranker_model_info - - result = get_reranker_model_info(profile) - - if json_mode: - print_json(**result) - else: - if not result["success"]: - console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}") - raise typer.Exit(code=1) - - data = result["result"] - console.print(f"[bold]Reranker Model Profile:[/bold] {data['profile']}") - console.print(f" Model name: {data['model_name']}") - console.print(f" Status: {'[green]Installed[/green]' if data['installed'] else '[dim]Not installed[/dim]'}") - if data['installed'] and data['actual_size_mb']: - console.print(f" Cache size: {data['actual_size_mb']:.1f} MB") - console.print(f" Location: [dim]{data['cache_path']}[/dim]") - else: - console.print(f" Estimated size: ~{data['estimated_size_mb']} MB") - console.print(f" Recommended: {'[green]Yes[/green]' if data.get('recommended') else '[dim]No[/dim]'}") - console.print(f"\n Description: {data['description']}") - console.print(f" Use case: {data['use_case']}") - - -# ==================== Embedding Management Commands ==================== - -@app.command(name="embeddings-status", hidden=True, deprecated=True) -def embeddings_status( - path: Optional[Path] = typer.Argument( - None, - exists=True, - help="Path to specific _index.db file or directory containing indexes. If not specified, uses default index root.", - ), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), -) -> None: - """[Deprecated] Use 'codexlens index status' instead. - - Check embedding status for one or all indexes. 
- - Shows embedding statistics including: - - Number of chunks generated - - File coverage percentage - - Files missing embeddings - - Examples: - codexlens embeddings-status # Check all indexes - codexlens embeddings-status ~/.codexlens/indexes/project/_index.db # Check specific index - codexlens embeddings-status ~/projects/my-app # Check project (auto-finds index) - """ - _deprecated_command_warning("embeddings-status", "index status") - from codexlens.cli.embedding_manager import check_index_embeddings, get_embedding_stats_summary - - # Determine what to check - if path is None: - # Check all indexes in default root - index_root = _get_index_root() - result = get_embedding_stats_summary(index_root) - - if json_mode: - print_json(**result) - else: - if not result["success"]: - console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}") - raise typer.Exit(code=1) - - data = result["result"] - total = data["total_indexes"] - with_emb = data["indexes_with_embeddings"] - total_chunks = data["total_chunks"] - - console.print(f"[bold]Embedding Status Summary[/bold]") - console.print(f"Index root: [dim]{index_root}[/dim]\n") - console.print(f"Total indexes: {total}") - console.print(f"Indexes with embeddings: [{'green' if with_emb > 0 else 'yellow'}]{with_emb}[/]/{total}") - console.print(f"Total chunks: {total_chunks:,}\n") - - if data["indexes"]: - table = Table(show_header=True, header_style="bold") - table.add_column("Project", style="cyan") - table.add_column("Files", justify="right") - table.add_column("Chunks", justify="right") - table.add_column("Coverage", justify="right") - table.add_column("Status", justify="center") - - for idx_stat in data["indexes"]: - status_icon = "[green]✓[/green]" if idx_stat["has_embeddings"] else "[dim]—[/dim]" - coverage = f"{idx_stat['coverage_percent']:.1f}%" if idx_stat["has_embeddings"] else "—" - - table.add_row( - idx_stat["project"], - str(idx_stat["total_files"]), - f"{idx_stat['total_chunks']:,}" if 
idx_stat["has_embeddings"] else "0", - coverage, - status_icon, - ) - - console.print(table) - - else: - # Check specific index or find index for project - target_path = path.expanduser().resolve() - - if target_path.is_file() and target_path.name == "_index.db": - # Direct index file - index_path = target_path - elif target_path.is_dir(): - # Try to find index for this project - registry = RegistryStore() - try: - registry.initialize() - mapper = PathMapper() - index_path = mapper.source_to_index_db(target_path) - - if not index_path.exists(): - console.print(f"[red]Error:[/red] No index found for {target_path}") - console.print("Run 'codexlens init' first to create an index") - raise typer.Exit(code=1) - finally: - registry.close() - else: - console.print(f"[red]Error:[/red] Path must be _index.db file or directory") - raise typer.Exit(code=1) - - result = check_index_embeddings(index_path) - - if json_mode: - print_json(**result) - else: - if not result["success"]: - console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}") - raise typer.Exit(code=1) - - data = result["result"] - has_emb = data["has_embeddings"] - - console.print(f"[bold]Embedding Status[/bold]") - console.print(f"Index: [dim]{data['index_path']}[/dim]\n") - - if has_emb: - console.print(f"[green]✓[/green] Embeddings available") - console.print(f" Total chunks: {data['total_chunks']:,}") - console.print(f" Total files: {data['total_files']:,}") - console.print(f" Files with embeddings: {data['files_with_chunks']:,}/{data['total_files']}") - console.print(f" Coverage: {data['coverage_percent']:.1f}%") - - if data["files_without_chunks"] > 0: - console.print(f"\n[yellow]Warning:[/yellow] {data['files_without_chunks']} files missing embeddings") - if data["missing_files_sample"]: - console.print(" Sample missing files:") - for file in data["missing_files_sample"]: - console.print(f" [dim]{file}[/dim]") - else: - console.print(f"[yellow]—[/yellow] No embeddings found") - 
console.print(f" Total files indexed: {data['total_files']:,}") - console.print("\n[dim]Generate embeddings with:[/dim]") - console.print(f" [cyan]codexlens embeddings-generate {index_path}[/cyan]") - - -@index_app.command("embeddings") -def index_embeddings( - path: Path = typer.Argument( - ..., - exists=True, - help="Path to _index.db file or project directory.", - ), - backend: str = typer.Option( - "fastembed", - "--backend", - "-b", - help="Embedding backend: fastembed (local) or litellm (remote API).", - ), - model: str = typer.Option( - "code", - "--model", - "-m", - help="Model: profile name for fastembed (fast/code/multilingual/balanced) or model name for litellm (e.g. text-embedding-3-small).", - ), - force: bool = typer.Option( - False, - "--force", - "-f", - help="Force regeneration even if embeddings exist.", - ), - chunk_size: int = typer.Option( - 2000, - "--chunk-size", - help="Maximum chunk size in characters.", - ), - max_workers: int = typer.Option( - 1, - "--max-workers", - "-w", - min=1, - help="Max concurrent API calls. Recommended: 4-8 for litellm backend. Default: 1 (sequential).", - ), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), - verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose output."), - centralized: bool = typer.Option( - True, - "--centralized/--distributed", - "-c/-d", - help="Use centralized vector storage (default) or distributed per-directory indexes.", - ), -) -> None: - """Generate semantic embeddings for code search. - - Creates vector embeddings for all files in an index to enable - semantic search capabilities. Embeddings are stored in the same - database as the FTS index. 
- - Storage Modes: - - Default: Per-directory HNSW indexes alongside _index.db files - - Centralized: Single HNSW index at project root (_vectors.hnsw) - - Embedding Backend Options: - - fastembed: Local ONNX-based embeddings (default, no API calls) - - litellm: Remote API embeddings via ccw-litellm (requires API keys) - - Model Options: - For fastembed backend (profiles): - - fast: BAAI/bge-small-en-v1.5 (384 dims, ~80MB) - - code: jinaai/jina-embeddings-v2-base-code (768 dims, ~150MB) [recommended] - - multilingual: intfloat/multilingual-e5-large (1024 dims, ~1GB) - - balanced: mixedbread-ai/mxbai-embed-large-v1 (1024 dims, ~600MB) - - For litellm backend (model names): - - text-embedding-3-small, text-embedding-3-large (OpenAI) - - text-embedding-ada-002 (OpenAI legacy) - - Any model supported by ccw-litellm - - Examples: - codexlens index embeddings ~/projects/my-app # Auto-find index (fastembed, code profile) - codexlens index embeddings ~/.codexlens/indexes/project/_index.db # Specific index - codexlens index embeddings ~/projects/my-app --backend litellm --model text-embedding-3-small # Use LiteLLM - codexlens index embeddings ~/projects/my-app --model fast --force # Regenerate with fast profile - codexlens index embeddings ~/projects/my-app --centralized # Centralized vector storage - """ - _configure_logging(verbose, json_mode) - - from codexlens.cli.embedding_manager import ( - generate_embeddings, - generate_dense_embeddings_centralized, - scan_for_model_conflicts, - check_global_model_lock, - set_locked_model_config, - ) - - # Validate backend - valid_backends = ["fastembed", "litellm"] - if backend not in valid_backends: - error_msg = f"Invalid backend: {backend}. 
Must be one of: {', '.join(valid_backends)}" - if json_mode: - print_json(success=False, error=error_msg) - else: - console.print(f"[red]Error:[/red] {error_msg}") - console.print(f"[dim]Valid backends: {', '.join(valid_backends)}[/dim]") - raise typer.Exit(code=1) - - # Resolve path - target_path = path.expanduser().resolve() - - # Determine index path or root for centralized mode - index_path = None - index_root = None - - if target_path.is_file() and target_path.name == "_index.db": - # Direct index file - index_path = target_path - index_root = target_path.parent - elif target_path.is_dir(): - # Directory: Find index location from registry - registry = RegistryStore() - try: - registry.initialize() - mapper = PathMapper() - index_path = mapper.source_to_index_db(target_path) - - if not index_path.exists(): - console.print(f"[red]Error:[/red] No index found for {target_path}") - console.print("Run 'codexlens init' first to create an index") - raise typer.Exit(code=1) - index_root = index_path.parent # Use index directory for both modes - finally: - registry.close() - else: - console.print(f"[red]Error:[/red] Path must be _index.db file or directory") - raise typer.Exit(code=1) - - # Progress callback - def progress_update(msg: str): - if not json_mode and verbose: - console.print(f" {msg}") - - console.print(f"[bold]Generating embeddings[/bold]") - if centralized: - effective_root = index_root if index_root else (index_path.parent if index_path else target_path) - console.print(f"Index root: [dim]{effective_root}[/dim]") - console.print(f"Mode: [green]Centralized[/green]") - else: - console.print(f"Index: [dim]{index_path}[/dim]") - console.print(f"Backend: [cyan]{backend}[/cyan]") - console.print(f"Model: [cyan]{model}[/cyan]") - if max_workers > 1: - console.print(f"Concurrency: [cyan]{max_workers} workers[/cyan]") - console.print() - - # Check global model lock (prevents mixing different models) - if not force: - lock_result = check_global_model_lock(backend, 
model) - if lock_result["has_conflict"]: - locked = lock_result["locked_config"] - if json_mode: - print_json( - success=False, - error="Global model lock conflict", - code="MODEL_LOCKED", - locked_config=locked, - target_config=lock_result["target_config"], - hint="Use --force to override the lock and switch to a different model (will regenerate all embeddings)", - ) - raise typer.Exit(code=1) - else: - console.print("[red]⛔ Global Model Lock Active[/red]") - console.print(f" Locked model: [cyan]{locked['backend']}/{locked['model']}[/cyan]") - console.print(f" Requested: [yellow]{backend}/{model}[/yellow]") - console.print(f" Locked at: {locked.get('locked_at', 'unknown')}") - console.print() - console.print("[dim]All indexes must use the same embedding model.[/dim]") - console.print("[dim]Use --force to switch models (will regenerate all embeddings).[/dim]") - raise typer.Exit(code=1) - - # Pre-check for model conflicts (only if not forcing) - if not force: - # Determine the index root for conflict scanning - scan_root = index_root if index_root else (index_path.parent if index_path else None) - - if scan_root: - conflict_result = scan_for_model_conflicts(scan_root, backend, model) - - if conflict_result["has_conflict"]: - existing = conflict_result["existing_config"] - conflict_count = len(conflict_result["conflicts"]) - - if json_mode: - # JSON mode: return structured error for UI handling - print_json( - success=False, - error="Model conflict detected", - code="MODEL_CONFLICT", - existing_config=existing, - target_config=conflict_result["target_config"], - conflict_count=conflict_count, - conflicts=conflict_result["conflicts"][:5], # Show first 5 conflicts - hint="Use --force to overwrite existing embeddings with the new model", - ) - raise typer.Exit(code=1) - else: - # Interactive mode: show warning and ask for confirmation - console.print("[yellow]⚠ Model Conflict Detected[/yellow]") - console.print(f" Existing: 
[red]{existing['backend']}/{existing['model']}[/red] ({existing.get('embedding_dim', '?')} dim)") - console.print(f" Requested: [green]{backend}/{model}[/green]") - console.print(f" Affected indexes: [yellow]{conflict_count}[/yellow]") - console.print() - console.print("[dim]Mixing different embedding models in the same index is not supported.[/dim]") - console.print("[dim]Overwriting will delete all existing embeddings and regenerate with the new model.[/dim]") - console.print() - - # Ask for confirmation - if typer.confirm("Overwrite existing embeddings with the new model?", default=False): - force = True - console.print("[green]Confirmed.[/green] Proceeding with overwrite...\n") - else: - console.print("[yellow]Cancelled.[/yellow] Use --force to skip this prompt.") - raise typer.Exit(code=0) - - if centralized: - # Centralized mode: single HNSW index at project root - if not index_root: - index_root = index_path.parent if index_path else target_path - result = generate_dense_embeddings_centralized( - index_root, - embedding_backend=backend, - model_profile=model, - force=force, - chunk_size=chunk_size, - progress_callback=progress_update, - max_workers=max_workers, - ) - else: - result = generate_embeddings( - index_path, - embedding_backend=backend, - model_profile=model, - force=force, - chunk_size=chunk_size, - progress_callback=progress_update, - max_workers=max_workers, - ) - - if json_mode: - print_json(**result) - else: - if not result["success"]: - error_msg = result.get("error", "Unknown error") - console.print(f"[red]Error:[/red] {error_msg}") - - # Provide helpful hints - if "already has" in error_msg: - console.print("\n[dim]Use --force to regenerate existing embeddings[/dim]") - elif "fastembed not available" in error_msg or "Semantic search not available" in error_msg: - console.print("\n[dim]Install semantic dependencies:[/dim]") - console.print(" [cyan]pip install codexlens[semantic][/cyan]") - elif "ccw-litellm not available" in error_msg: - 
console.print("\n[dim]Install LiteLLM backend dependencies:[/dim]") - console.print(" [cyan]pip install ccw-litellm[/cyan]") - - raise typer.Exit(code=1) - - data = result["result"] - - # Set global model lock after successful generation - # This prevents using different models for future indexes - set_locked_model_config(backend, model) - - if centralized: - # Centralized mode output - elapsed = data.get("elapsed_time", 0) - console.print(f"[green]v[/green] Centralized embeddings generated successfully!") - console.print(f" Model: {data.get('model_name', model)}") - console.print(f" Chunks created: {data['chunks_created']:,}") - console.print(f" Files processed: {data['files_processed']}") - if data.get("files_failed", 0) > 0: - console.print(f" [yellow]Files failed: {data['files_failed']}[/yellow]") - console.print(f" Central index: {data.get('central_index_path', 'N/A')}") - console.print(f" Time: {elapsed:.1f}s") - else: - # Single index mode output - elapsed = data["elapsed_time"] - - console.print(f"[green]v[/green] Embeddings generated successfully!") - console.print(f" Model: {data['model_name']}") - console.print(f" Chunks created: {data['chunks_created']:,}") - console.print(f" Files processed: {data['files_processed']}") - - if data["files_failed"] > 0: - console.print(f" [yellow]Files failed: {data['files_failed']}[/yellow]") - if data["failed_files"]: - console.print(" [dim]First failures:[/dim]") - for file_path, error in data["failed_files"]: - console.print(f" [dim]{file_path}: {error}[/dim]") - - console.print(f" Time: {elapsed:.1f}s") - - console.print("\n[dim]Use vector search with:[/dim]") - console.print(" [cyan]codexlens search 'your query' --mode pure-vector[/cyan]") - - -# ==================== GPU Management Commands ==================== - -@app.command(name="gpu-list") -def gpu_list( - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), -) -> None: - """List available GPU devices for embedding acceleration. 
- - Shows all detected GPU devices with their capabilities and selection status. - Discrete GPUs (NVIDIA, AMD) are automatically preferred over integrated GPUs. - - Examples: - codexlens gpu-list # List all GPUs - codexlens gpu-list --json # JSON output for scripting - """ - from codexlens.semantic.gpu_support import get_gpu_devices, detect_gpu, get_selected_device_id - - gpu_info = detect_gpu() - devices = get_gpu_devices() - selected_id = get_selected_device_id() - - if json_mode: - print_json( - success=True, - result={ - "devices": devices, - "selected_device_id": selected_id, - "gpu_available": gpu_info.gpu_available, - "providers": gpu_info.onnx_providers, - } - ) - else: - if not devices: - console.print("[yellow]No GPU devices detected[/yellow]") - console.print(f"ONNX Providers: [dim]{', '.join(gpu_info.onnx_providers)}[/dim]") - return - - console.print("[bold]Available GPU Devices[/bold]\n") - - table = Table(show_header=True, header_style="bold") - table.add_column("ID", justify="center") - table.add_column("Name") - table.add_column("Vendor", justify="center") - table.add_column("Type", justify="center") - table.add_column("Status", justify="center") - - for dev in devices: - type_str = "[green]Discrete[/green]" if dev["is_discrete"] else "[dim]Integrated[/dim]" - vendor_color = { - "nvidia": "green", - "amd": "red", - "intel": "blue" - }.get(dev["vendor"], "white") - vendor_str = f"[{vendor_color}]{dev['vendor'].upper()}[/{vendor_color}]" - - status_parts = [] - if dev["is_preferred"]: - status_parts.append("[cyan]Auto[/cyan]") - if dev["is_selected"]: - status_parts.append("[green]✓ Selected[/green]") - - status_str = " ".join(status_parts) if status_parts else "[dim]—[/dim]" - - table.add_row( - str(dev["device_id"]), - dev["name"], - vendor_str, - type_str, - status_str, - ) - - console.print(table) - console.print(f"\nONNX Providers: [dim]{', '.join(gpu_info.onnx_providers)}[/dim]") - console.print("\n[dim]Select GPU with:[/dim]") - 
console.print(" [cyan]codexlens gpu-select [/cyan]") - - -@app.command(name="gpu-select") -def gpu_select( - device_id: int = typer.Argument( - ..., - help="GPU device ID to use for embeddings. Use 'codexlens gpu-list' to see available IDs.", - ), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), -) -> None: - """Select a specific GPU device for embedding generation. - - By default, CodexLens automatically selects the most powerful GPU (discrete over integrated). - Use this command to override the selection. - - Examples: - codexlens gpu-select 1 # Use GPU device 1 - codexlens gpu-select 0 --json # Select GPU 0 with JSON output - """ - from codexlens.semantic.gpu_support import set_selected_device_id, get_gpu_devices - from codexlens.semantic.embedder import clear_embedder_cache - - devices = get_gpu_devices() - valid_ids = [dev["device_id"] for dev in devices] - - if device_id not in valid_ids: - if json_mode: - print_json(success=False, error=f"Invalid device_id {device_id}. 
Valid IDs: {valid_ids}") - else: - console.print(f"[red]Error:[/red] Invalid device_id {device_id}") - console.print(f"Valid IDs: {valid_ids}") - console.print("\n[dim]Use 'codexlens gpu-list' to see available devices[/dim]") - raise typer.Exit(code=1) - - success = set_selected_device_id(device_id) - - if success: - # Clear embedder cache to force reload with new GPU - clear_embedder_cache() - - device_name = next((dev["name"] for dev in devices if dev["device_id"] == device_id), "Unknown") - - if json_mode: - print_json( - success=True, - result={ - "device_id": device_id, - "device_name": device_name, - "message": f"GPU selection set to device {device_id}: {device_name}", - } - ) - else: - console.print(f"[green]✓[/green] GPU selection updated") - console.print(f" Device ID: {device_id}") - console.print(f" Device: [cyan]{device_name}[/cyan]") - console.print("\n[dim]New embeddings will use this GPU[/dim]") - else: - if json_mode: - print_json(success=False, error="Failed to set GPU selection") - else: - console.print("[red]Error:[/red] Failed to set GPU selection") - raise typer.Exit(code=1) - - -@app.command(name="gpu-reset") -def gpu_reset( - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), -) -> None: - """Reset GPU selection to automatic detection. - - Clears any manual GPU selection and returns to automatic selection - (discrete GPU preferred over integrated). 
- - Examples: - codexlens gpu-reset # Reset to auto-detection - """ - from codexlens.semantic.gpu_support import set_selected_device_id, detect_gpu - from codexlens.semantic.embedder import clear_embedder_cache - - set_selected_device_id(None) - clear_embedder_cache() - - gpu_info = detect_gpu(force_refresh=True) - - if json_mode: - print_json( - success=True, - result={ - "message": "GPU selection reset to auto-detection", - "preferred_device_id": gpu_info.preferred_device_id, - "preferred_device_name": gpu_info.gpu_name, - } - ) - else: - console.print("[green]✓[/green] GPU selection reset to auto-detection") - if gpu_info.preferred_device_id is not None: - console.print(f" Auto-selected device: {gpu_info.preferred_device_id}") - console.print(f" Device: [cyan]{gpu_info.gpu_name}[/cyan]") - - - -# ==================== SPLADE Commands ==================== - -@index_app.command("splade") -def index_splade( - path: Path = typer.Argument(..., help="Project path to index"), - rebuild: bool = typer.Option(False, "--rebuild", "-r", help="Force rebuild SPLADE index"), - verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose output."), -) -> None: - """Generate SPLADE sparse index for existing codebase. - - Encodes all semantic chunks with SPLADE model and builds inverted index - for efficient sparse retrieval. - - This command discovers all _index.db files recursively in the project's - index directory and builds SPLADE encodings for chunks across all of them. - - Examples: - codexlens index splade ~/projects/my-app - codexlens index splade . 
--rebuild - """ - _configure_logging(verbose) - - from codexlens.semantic.splade_encoder import get_splade_encoder, check_splade_available - from codexlens.storage.splade_index import SpladeIndex - from codexlens.semantic.vector_store import VectorStore - - # Check SPLADE availability - ok, err = check_splade_available() - if not ok: - console.print(f"[red]SPLADE not available: {err}[/red]") - console.print("[dim]Install with: pip install transformers torch[/dim]") - raise typer.Exit(1) - - # Find index root directory - target_path = path.expanduser().resolve() - - # Determine index root directory (containing _index.db files) - if target_path.is_file() and target_path.name == "_index.db": - index_root = target_path.parent - elif target_path.is_dir(): - # Check for local .codexlens/_index.db - local_index = target_path / ".codexlens" / "_index.db" - if local_index.exists(): - index_root = local_index.parent - else: - # Try to find via registry - registry = RegistryStore() - try: - registry.initialize() - mapper = PathMapper() - index_db = mapper.source_to_index_db(target_path) - if not index_db.exists(): - console.print(f"[red]Error:[/red] No index found for {target_path}") - console.print("Run 'codexlens init' first to create an index") - raise typer.Exit(1) - index_root = index_db.parent - finally: - registry.close() - else: - console.print(f"[red]Error:[/red] Path must be _index.db file or indexed directory") - raise typer.Exit(1) - - # Discover all _index.db files recursively - all_index_dbs = sorted(index_root.rglob("_index.db")) - if not all_index_dbs: - console.print(f"[red]Error:[/red] No _index.db files found in {index_root}") - raise typer.Exit(1) - - console.print(f"[blue]Discovered {len(all_index_dbs)} index databases[/blue]") - - # SPLADE index is stored alongside the root _index.db - from codexlens.config import SPLADE_DB_NAME - splade_db = index_root / SPLADE_DB_NAME - - if splade_db.exists() and not rebuild: - console.print("[yellow]SPLADE index 
exists. Use --rebuild to regenerate.[/yellow]") - return - - # If rebuild, delete existing splade database - if splade_db.exists() and rebuild: - splade_db.unlink() - - # Collect all chunks from all distributed index databases - # Assign globally unique IDs to avoid collisions (each DB starts with ID 1) - console.print(f"[blue]Loading chunks from {len(all_index_dbs)} distributed indexes...[/blue]") - all_chunks = [] # (global_id, chunk) pairs - total_files_checked = 0 - indexes_with_chunks = 0 - global_id = 0 # Sequential global ID across all databases - - for index_db in all_index_dbs: - total_files_checked += 1 - try: - vector_store = VectorStore(index_db) - chunks = vector_store.get_all_chunks() - if chunks: - indexes_with_chunks += 1 - # Assign sequential global IDs to avoid collisions - for chunk in chunks: - global_id += 1 - all_chunks.append((global_id, chunk, index_db)) - if verbose: - console.print(f" [dim]{index_db.parent.name}: {len(chunks)} chunks[/dim]") - vector_store.close() - except Exception as e: - if verbose: - console.print(f" [yellow]Warning: Failed to read {index_db}: {e}[/yellow]") - - if not all_chunks: - console.print("[yellow]No chunks found in any index database[/yellow]") - console.print(f"[dim]Checked {total_files_checked} index files, found 0 chunks[/dim]") - console.print("[dim]Generate embeddings first with 'codexlens embeddings-generate --recursive'[/dim]") - raise typer.Exit(1) - - console.print(f"[blue]Found {len(all_chunks)} chunks across {indexes_with_chunks} indexes[/blue]") - console.print(f"[blue]Encoding with SPLADE...[/blue]") - - # Initialize SPLADE - encoder = get_splade_encoder() - splade_index = SpladeIndex(splade_db) - splade_index.create_tables() - - # Encode in batches with progress bar - chunk_metadata_batch = [] - with Progress( - SpinnerColumn(), - TextColumn("[progress.description]{task.description}"), - BarColumn(), - TextColumn("[progress.percentage]{task.percentage:>3.0f}%"), - TimeElapsedColumn(), - 
console=console, - ) as progress: - task = progress.add_task("Encoding...", total=len(all_chunks)) - for global_id, chunk, source_db_path in all_chunks: - sparse_vec = encoder.encode_text(chunk.content) - splade_index.add_posting(global_id, sparse_vec) - # Store chunk metadata for self-contained search - # Serialize metadata dict to JSON string - metadata_str = None - if hasattr(chunk, 'metadata') and chunk.metadata: - try: - metadata_str = json.dumps(chunk.metadata) if isinstance(chunk.metadata, dict) else chunk.metadata - except Exception: - pass - chunk_metadata_batch.append(( - global_id, - chunk.file_path or "", - chunk.content, - metadata_str, - str(source_db_path) - )) - progress.advance(task) - - # Batch insert chunk metadata - if chunk_metadata_batch: - splade_index.add_chunks_metadata_batch(chunk_metadata_batch) - - # Set metadata - splade_index.set_metadata( - model_name=encoder.model_name, - vocab_size=encoder.vocab_size - ) - - stats = splade_index.get_stats() - console.print(f"[green]OK[/green] SPLADE index built: {stats['unique_chunks']} chunks, {stats['total_postings']} postings") - console.print(f" Source indexes: {indexes_with_chunks}") - console.print(f" Database: [dim]{splade_db}[/dim]") - - -@app.command("splade-status", hidden=True, deprecated=True) -def splade_status_command( - path: Path = typer.Argument(..., help="Project path"), - verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose output."), -) -> None: - """[Deprecated] Use 'codexlens index status' instead. - - Show SPLADE index status and statistics. - - Examples: - codexlens splade-status ~/projects/my-app - codexlens splade-status . 
- """ - _deprecated_command_warning("splade-status", "index status") - _configure_logging(verbose) - - from codexlens.storage.splade_index import SpladeIndex - from codexlens.semantic.splade_encoder import check_splade_available - from codexlens.config import SPLADE_DB_NAME - - # Find index database - target_path = path.expanduser().resolve() - - if target_path.is_file() and target_path.name == "_index.db": - splade_db = target_path.parent / SPLADE_DB_NAME - elif target_path.is_dir(): - # Check for local .codexlens/_splade.db - local_splade = target_path / ".codexlens" / SPLADE_DB_NAME - if local_splade.exists(): - splade_db = local_splade - else: - # Try to find via registry - registry = RegistryStore() - try: - registry.initialize() - mapper = PathMapper() - index_db = mapper.source_to_index_db(target_path) - splade_db = index_db.parent / SPLADE_DB_NAME - finally: - registry.close() - else: - console.print(f"[red]Error:[/red] Path must be _index.db file or indexed directory") - raise typer.Exit(1) - - if not splade_db.exists(): - console.print("[yellow]No SPLADE index found[/yellow]") - console.print(f"[dim]Run 'codexlens splade-index {path}' to create one[/dim]") - return - - splade_index = SpladeIndex(splade_db) - - if not splade_index.has_index(): - console.print("[yellow]SPLADE tables not initialized[/yellow]") - return - - metadata = splade_index.get_metadata() - stats = splade_index.get_stats() - - # Create status table - table = Table(title="SPLADE Index Status", show_header=False) - table.add_column("Property", style="cyan") - table.add_column("Value") - - table.add_row("Database", str(splade_db)) - if metadata: - table.add_row("Model", metadata['model_name']) - table.add_row("Vocab Size", str(metadata['vocab_size'])) - table.add_row("Chunks", str(stats['unique_chunks'])) - table.add_row("Unique Tokens", str(stats['unique_tokens'])) - table.add_row("Total Postings", str(stats['total_postings'])) - - ok, err = check_splade_available() - status_text = 
"[green]Yes[/green]" if ok else f"[red]No[/red] - {err}" - table.add_row("SPLADE Available", status_text) - - console.print(table) - - -# ==================== Watch Command ==================== - -@app.command() -def watch( - path: Path = typer.Argument(Path("."), exists=True, file_okay=False, dir_okay=True, help="Project root to watch."), - language: Optional[List[str]] = typer.Option(None, "--language", "-l", help="Languages to watch (comma-separated)."), - debounce: int = typer.Option(1000, "--debounce", "-d", min=100, max=10000, help="Debounce interval in milliseconds."), - verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."), -) -> None: - """Watch a directory for file changes and incrementally update the index. - - Monitors the specified directory for file system changes (create, modify, delete) - and automatically updates the CodexLens index. The directory must already be indexed - using 'codexlens init' before watching. - - Examples: - # Watch current directory - codexlens watch . - - # Watch with custom debounce interval - codexlens watch . --debounce 2000 - - # Watch only Python and JavaScript files - codexlens watch . --language python,javascript - - Press Ctrl+C to stop watching. 
- """ - _configure_logging(verbose) - watch_path = path.expanduser().resolve() - - registry: RegistryStore | None = None - try: - # Validate that path is indexed - registry = RegistryStore() - registry.initialize() - mapper = PathMapper() - - project_record = registry.find_by_source_path(str(watch_path)) - if not project_record: - console.print(f"[red]Error:[/red] Directory is not indexed: {watch_path}") - console.print("[dim]Run 'codexlens init' first to create an index.[/dim]") - raise typer.Exit(code=1) - - # Parse languages - languages = _parse_languages(language) - - # Create watcher config - watcher_config = WatcherConfig( - debounce_ms=debounce, - languages=languages, - ) - - # Display startup message - console.print(f"[green]Starting watcher for:[/green] {watch_path}") - console.print(f"[dim]Debounce interval: {debounce}ms[/dim]") - if languages: - console.print(f"[dim]Watching languages: {', '.join(languages)}[/dim]") - console.print("[dim]Press Ctrl+C to stop[/dim]\n") - - # Create and start watcher manager - manager = WatcherManager( - root_path=watch_path, - watcher_config=watcher_config, - on_indexed=lambda result: _display_index_result(result), - ) - - manager.start() - manager.wait() - - except KeyboardInterrupt: - console.print("\n[yellow]Stopping watcher...[/yellow]") - except CodexLensError as exc: - console.print(f"[red]Watch failed:[/red] {exc}") - raise typer.Exit(code=1) - except Exception as exc: - console.print(f"[red]Unexpected error:[/red] {exc}") - raise typer.Exit(code=1) - finally: - if registry is not None: - registry.close() - - -def _display_index_result(result) -> None: - """Display indexing result in real-time.""" - if result.files_indexed > 0 or result.files_removed > 0: - parts = [] - if result.files_indexed > 0: - parts.append(f"[green]✓ Indexed {result.files_indexed} file(s)[/green]") - if result.files_removed > 0: - parts.append(f"[yellow]✗ Removed {result.files_removed} file(s)[/yellow]") - console.print(" | ".join(parts)) - 
- if result.errors: - for error in result.errors[:3]: # Show max 3 errors - console.print(f" [red]Error:[/red] {error}") - if len(result.errors) > 3: - console.print(f" [dim]... and {len(result.errors) - 3} more errors[/dim]") - - - -# ==================== Cascade Index Commands ==================== - - -def get_binary_index_path(db_path: Path) -> Path: - """Get the path for binary ANN index file. - - Args: - db_path: Path to the _index.db file - - Returns: - Path to the binary index file (_index_binary.bin) - """ - return db_path.parent / f"{db_path.stem}_binary.bin" - - -@index_app.command("binary") -def index_binary( - path: Annotated[Path, typer.Argument(help="Directory to index")], - force: Annotated[bool, typer.Option("--force", "-f", help="Force regenerate")] = False, - batch_size: Annotated[int, typer.Option("--batch-size", "-b", help="Batch size for embedding")] = 32, - json_mode: Annotated[bool, typer.Option("--json", help="Output JSON response")] = False, - verbose: Annotated[bool, typer.Option("--verbose", "-v", help="Enable verbose logging")] = False, -) -> None: - """Generate cascade embeddings (binary + dense) for two-stage retrieval. - - Cascade retrieval uses a two-stage approach: - 1. Binary search (fast, 32 bytes/vector) -> coarse filtering - 2. Dense rerank (precise, 8KB/vector) -> final results - - This command: - - Finds all _index.db files in the directory - - Generates binary (256-dim) and dense (2048-dim) embeddings for each chunk - - Stores embeddings in the database (embedding_binary, embedding_dense columns) - - Creates a BinaryANNIndex file for fast coarse retrieval - - Examples: - codexlens index binary ~/projects/my-app - codexlens index binary . --force - codexlens index binary . 
--batch-size 64 --verbose - """ - _configure_logging(verbose, json_mode) - - target_path = path.expanduser().resolve() - - # Find index database(s) - if target_path.is_file() and target_path.name == "_index.db": - index_dbs = [target_path] - elif target_path.is_dir(): - # Check local .codexlens/_index.db first - local_index = target_path / ".codexlens" / "_index.db" - if local_index.exists(): - index_dbs = [local_index] - else: - # Find via registry - registry = RegistryStore() - try: - registry.initialize() - mapper = PathMapper() - index_db = mapper.source_to_index_db(target_path) - if not index_db.exists(): - if json_mode: - print_json(success=False, error=f"No index found for {target_path}") - else: - console.print(f"[red]Error:[/red] No index found for {target_path}") - console.print("Run 'codexlens init' first to create an index") - raise typer.Exit(code=1) - # Find all _index.db files under the index root - index_root = index_db.parent - index_dbs = list(index_root.rglob("_index.db")) - finally: - registry.close() - else: - if json_mode: - print_json(success=False, error="Path must be _index.db file or indexed directory") - else: - console.print("[red]Error:[/red] Path must be _index.db file or indexed directory") - raise typer.Exit(code=1) - - if not index_dbs: - if json_mode: - print_json(success=False, error="No index databases found") - else: - console.print("[yellow]No index databases found[/yellow]") - raise typer.Exit(code=1) - - # Import cascade embedding backend - try: - from codexlens.indexing.embedding import CascadeEmbeddingBackend - from codexlens.semantic.ann_index import BinaryANNIndex - from codexlens.indexing.embedding import pack_binary_embedding - except ImportError as e: - error_msg = f"Cascade embedding dependencies not available: {e}" - if json_mode: - print_json(success=False, error=error_msg) - else: - console.print(f"[red]Error:[/red] {error_msg}") - console.print("[dim]Install with: pip install codexlens[semantic][/dim]") - raise 
typer.Exit(code=1) - - if not json_mode: - console.print(f"[bold]Generating cascade embeddings[/bold]") - console.print(f"Path: [dim]{target_path}[/dim]") - console.print(f"Index databases: [cyan]{len(index_dbs)}[/cyan]") - console.print(f"Batch size: [cyan]{batch_size}[/cyan]") - console.print() - - # Initialize cascade embedding backend - try: - cascade_backend = CascadeEmbeddingBackend() - except Exception as e: - error_msg = f"Failed to initialize cascade embedding backend: {e}" - if json_mode: - print_json(success=False, error=error_msg) - else: - console.print(f"[red]Error:[/red] {error_msg}") - raise typer.Exit(code=1) - - # Process statistics - total_chunks_processed = 0 - total_indexes_processed = 0 - total_indexes_successful = 0 - total_binary_indexes_created = 0 - errors_list: List[str] = [] - - # Process each index database - with Progress( - SpinnerColumn(), - TextColumn("[progress.description]{task.description}"), - BarColumn(), - TextColumn("[progress.percentage]{task.percentage:>3.0f}%"), - TextColumn("({task.completed}/{task.total})"), - TimeElapsedColumn(), - console=console, - disable=json_mode, - ) as progress: - db_task = progress.add_task("Processing indexes...", total=len(index_dbs)) - - for db_path in index_dbs: - total_indexes_processed += 1 - index_name = db_path.parent.name - - try: - # Open the index store - store = DirIndexStore(db_path) - store.initialize() - - # Get connection for direct queries - conn = store._get_connection() - - # Ensure cascade columns exist in semantic_chunks table - try: - conn.execute("ALTER TABLE semantic_chunks ADD COLUMN embedding_binary BLOB") - except Exception: - pass # Column already exists - try: - conn.execute("ALTER TABLE semantic_chunks ADD COLUMN embedding_dense BLOB") - except Exception: - pass # Column already exists - conn.commit() - - # Check if semantic_chunks table exists and has data - try: - cursor = conn.execute("SELECT COUNT(*) FROM semantic_chunks") - chunk_count = cursor.fetchone()[0] - 
except Exception: - # semantic_chunks table doesn't exist or is empty - chunk_count = 0 - - if chunk_count == 0: - if verbose and not json_mode: - console.print(f" [dim]Skipping {index_name}: no chunks found[/dim]") - progress.advance(db_task) - store.close() - continue - - # Check if embeddings already exist (unless force) - if not force: - cursor = conn.execute( - "SELECT COUNT(*) FROM semantic_chunks WHERE embedding_binary IS NOT NULL" - ) - existing_count = cursor.fetchone()[0] - if existing_count > 0: - if verbose and not json_mode: - console.print(f" [dim]Skipping {index_name}: embeddings exist (use --force to regenerate)[/dim]") - progress.advance(db_task) - store.close() - continue - - # If force, clear existing cascade embeddings - if force: - conn.execute( - "UPDATE semantic_chunks SET embedding_binary = NULL, embedding_dense = NULL" - ) - conn.commit() - - # Get all chunks - cursor = conn.execute("SELECT id, content FROM semantic_chunks") - chunks = cursor.fetchall() - - if not chunks: - progress.advance(db_task) - store.close() - continue - - if verbose and not json_mode: - console.print(f" Processing {index_name}: {len(chunks)} chunks") - - # Process in batches - chunk_task = progress.add_task( - f" {index_name}", total=len(chunks) - ) - - # Prepare for BinaryANNIndex - binary_index_path = get_binary_index_path(db_path) - binary_ann_index = BinaryANNIndex(db_path, dim=256) - - for i in range(0, len(chunks), batch_size): - batch_chunks = chunks[i:i + batch_size] - batch_ids = [c[0] for c in batch_chunks] - batch_contents = [c[1] for c in batch_chunks] - - # Generate cascade embeddings - binary_embeddings, dense_embeddings = cascade_backend.encode_cascade( - batch_contents, batch_size=batch_size - ) - - # Pack binary embeddings and convert dense to bytes - packed_binaries = [] - dense_bytes_list = [] - - for j in range(len(batch_ids)): - # Pack binary embedding (256 bits -> 32 bytes) - packed_binary = pack_binary_embedding(binary_embeddings[j]) - 
packed_binaries.append(packed_binary) - - # Convert dense embedding to bytes - import numpy as np - dense_blob = dense_embeddings[j].astype(np.float32).tobytes() - dense_bytes_list.append(dense_blob) - - # Update database - for j, chunk_id in enumerate(batch_ids): - conn.execute( - """ - UPDATE semantic_chunks - SET embedding_binary = ?, embedding_dense = ? - WHERE id = ? - """, - (packed_binaries[j], dense_bytes_list[j], chunk_id) - ) - - # Add to binary ANN index - binary_ann_index.add_vectors(batch_ids, packed_binaries) - - conn.commit() - total_chunks_processed += len(batch_ids) - progress.advance(chunk_task, len(batch_ids)) - - # Save binary ANN index - binary_ann_index.save() - total_binary_indexes_created += 1 - - progress.remove_task(chunk_task) - store.close() - total_indexes_successful += 1 - - except Exception as e: - error_msg = f"{index_name}: {e}" - errors_list.append(error_msg) - if verbose and not json_mode: - console.print(f" [red]Error processing {index_name}:[/red] {e}") - - progress.advance(db_task) - - # Build result - result = { - "path": str(target_path), - "indexes_processed": total_indexes_processed, - "indexes_successful": total_indexes_successful, - "chunks_processed": total_chunks_processed, - "binary_indexes_created": total_binary_indexes_created, - "errors": len(errors_list), - "error_details": errors_list[:5] if errors_list else [], - } - - if json_mode: - print_json(success=True, result=result) - else: - console.print(f"\n[green]Cascade indexing complete[/green]") - console.print(f" Indexes processed: {total_indexes_processed}") - console.print(f" Indexes successful: {total_indexes_successful}") - console.print(f" Chunks processed: {total_chunks_processed:,}") - console.print(f" Binary indexes created: {total_binary_indexes_created}") - if errors_list: - console.print(f" [yellow]Errors: {len(errors_list)}[/yellow]") - for err in errors_list[:3]: - console.print(f" [dim]{err}[/dim]") - if len(errors_list) > 3: - console.print(f" 
[dim]... and {len(errors_list) - 3} more[/dim]") - - -# ==================== Index Status Command ==================== - -@index_app.command("status") -def index_status( - path: Optional[Path] = typer.Argument( - None, - help="Path to project directory or _index.db file. If not specified, uses default index root.", - ), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), - verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose output."), -) -> None: - """Show comprehensive index status (embeddings + SPLADE). - - Shows combined status for all index types: - - Dense vector embeddings (HNSW) - - SPLADE sparse embeddings - - Binary cascade embeddings - - Examples: - codexlens index status # Check all indexes - codexlens index status ~/projects/my-app # Check specific project - codexlens index status --json # JSON output - """ - _configure_logging(verbose, json_mode) - - from codexlens.cli.embedding_manager import check_index_embeddings, get_embedding_stats_summary - from codexlens.storage.splade_index import SpladeIndex - from codexlens.semantic.splade_encoder import check_splade_available - from codexlens.config import SPLADE_DB_NAME - - # Determine target path and index root - if path is None: - index_root = _get_index_root() - target_path = None - else: - target_path = path.resolve() - if target_path.is_file() and target_path.name == "_index.db": - index_root = target_path.parent - elif target_path.is_dir(): - # Try to find index for this project - registry = RegistryStore() - try: - registry.initialize() - mapper = PathMapper() - index_path = mapper.source_to_index_db(target_path) - if index_path.exists(): - index_root = index_path.parent - else: - if json_mode: - print_json(success=False, error=f"No index found for {target_path}") - else: - console.print(f"[red]Error:[/red] No index found for {target_path}") - console.print("Run 'codexlens index init' first to create an index") - raise typer.Exit(code=1) - finally: 
- registry.close() - else: - if json_mode: - print_json(success=False, error="Path must be _index.db file or directory") - else: - console.print(f"[red]Error:[/red] Path must be _index.db file or directory") - raise typer.Exit(code=1) - - # Get embeddings status - embeddings_result = get_embedding_stats_summary(index_root) - - # Get SPLADE status - splade_db = index_root / SPLADE_DB_NAME - splade_status = { - "available": False, - "has_index": False, - "stats": None, - "metadata": None, - } - - splade_available, splade_err = check_splade_available() - splade_status["available"] = splade_available - - if splade_db.exists(): - try: - splade_index = SpladeIndex(splade_db) - if splade_index.has_index(): - splade_status["has_index"] = True - splade_status["stats"] = splade_index.get_stats() - splade_status["metadata"] = splade_index.get_metadata() - splade_index.close() - except Exception as e: - if verbose: - console.print(f"[yellow]Warning: Failed to read SPLADE index: {e}[/yellow]") - - # Build combined result - result = { - "index_root": str(index_root), - "embeddings": embeddings_result.get("result") if embeddings_result.get("success") else None, - "embeddings_error": embeddings_result.get("error") if not embeddings_result.get("success") else None, - "splade": splade_status, - } - - if json_mode: - print_json(success=True, result=result) - else: - console.print(f"[bold]Index Status[/bold]") - console.print(f"Index root: [dim]{index_root}[/dim]\n") - - # Embeddings section - console.print("[bold]Dense Embeddings (HNSW):[/bold]") - if embeddings_result.get("success"): - data = embeddings_result["result"] - total = data.get("total_indexes", 0) - with_emb = data.get("indexes_with_embeddings", 0) - total_chunks = data.get("total_chunks", 0) - - console.print(f" Total indexes: {total}") - console.print(f" Indexes with embeddings: [{'green' if with_emb > 0 else 'yellow'}]{with_emb}[/]/{total}") - console.print(f" Total chunks: {total_chunks:,}") - else: - console.print(f" 
[yellow]--[/yellow] {embeddings_result.get('error', 'Not available')}") - - # SPLADE section - console.print("\n[bold]SPLADE Sparse Index:[/bold]") - if splade_status["has_index"]: - stats = splade_status["stats"] or {} - metadata = splade_status["metadata"] or {} - console.print(f" [green]OK[/green] SPLADE index available") - console.print(f" Chunks: {stats.get('unique_chunks', 0):,}") - console.print(f" Unique tokens: {stats.get('unique_tokens', 0):,}") - console.print(f" Total postings: {stats.get('total_postings', 0):,}") - if metadata.get("model_name"): - console.print(f" Model: {metadata['model_name']}") - elif splade_available: - console.print(f" [yellow]--[/yellow] No SPLADE index found") - console.print(f" [dim]Run 'codexlens index splade ' to create one[/dim]") - else: - console.print(f" [yellow]--[/yellow] SPLADE not available: {splade_err}") - - # Runtime availability - console.print("\n[bold]Runtime Availability:[/bold]") - console.print(f" SPLADE encoder: {'[green]Yes[/green]' if splade_available else f'[red]No[/red] ({splade_err})'}") - - -# ==================== Index Update Command ==================== - -@index_app.command("update") -def index_update( - file_path: Path = typer.Argument(..., exists=True, file_okay=True, dir_okay=False, help="Path to the file to update in the index."), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), - verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."), -) -> None: - """Update the index for a single file incrementally. - - This is a lightweight command designed for use in hooks (e.g., Claude Code PostToolUse). - It updates only the specified file without scanning the entire directory. - - The file's parent directory must already be indexed via 'codexlens index init'. 
- - Examples: - codexlens index update src/main.py # Update single file - codexlens index update ./foo.ts --json # JSON output for hooks - """ - _configure_logging(verbose, json_mode) - - from codexlens.watcher.incremental_indexer import IncrementalIndexer - - registry: RegistryStore | None = None - indexer: IncrementalIndexer | None = None - - try: - registry = RegistryStore() - registry.initialize() - mapper = PathMapper() - config = Config() - - resolved_path = file_path.resolve() - - # Check if project is indexed - source_root = mapper.get_project_root(resolved_path) - if not source_root or not registry.get_project(source_root): - error_msg = f"Project containing file is not indexed: {file_path}" - if json_mode: - print_json(success=False, error=error_msg) - else: - console.print(f"[red]Error:[/red] {error_msg}") - console.print("[dim]Run 'codexlens index init' on the project root first.[/dim]") - raise typer.Exit(code=1) - - indexer = IncrementalIndexer(registry, mapper, config) - result = indexer._index_file(resolved_path) - - if result.success: - if json_mode: - print_json(success=True, result={ - "path": str(result.path), - "symbols_count": result.symbols_count, - "status": "updated", - }) - else: - console.print(f"[green]✓[/green] Updated index for [bold]{result.path.name}[/bold] ({result.symbols_count} symbols)") - else: - error_msg = result.error or f"Failed to update index for {file_path}" - if json_mode: - print_json(success=False, error=error_msg) - else: - console.print(f"[red]Error:[/red] {error_msg}") - raise typer.Exit(code=1) - - except CodexLensError as exc: - if json_mode: - print_json(success=False, error=str(exc)) - else: - console.print(f"[red]Update failed:[/red] {exc}") - raise typer.Exit(code=1) - finally: - if indexer: - indexer.close() - if registry: - registry.close() - - -# ==================== Index All Command ==================== - -@index_app.command("all") -def index_all( - path: Path = typer.Argument(Path("."), exists=True, 
file_okay=False, dir_okay=True, help="Project root to index."), - language: Optional[List[str]] = typer.Option( - None, - "--language", - "-l", - help="Limit indexing to specific languages (repeat or comma-separated).", - ), - workers: Optional[int] = typer.Option(None, "--workers", "-w", min=1, help="Parallel worker processes."), - force: bool = typer.Option(False, "--force", "-f", help="Force full reindex."), - backend: str = typer.Option("fastembed", "--backend", "-b", help="Embedding backend: fastembed or litellm."), - model: str = typer.Option("code", "--model", "-m", help="Embedding model profile or name."), - max_workers: int = typer.Option(1, "--max-workers", min=1, help="Max concurrent API calls."), - skip_splade: bool = typer.Option(False, "--skip-splade", help="Skip SPLADE index generation."), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), - verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."), -) -> None: - """Run all indexing operations in sequence (init, embeddings, splade). - - This is a convenience command that runs the complete indexing pipeline: - 1. FTS index initialization (index init) - 2. Dense vector embeddings (index embeddings) - 3. SPLADE sparse index (index splade) - unless --skip-splade - - Examples: - codexlens index all ~/projects/my-app - codexlens index all . --force - codexlens index all . --backend litellm --model text-embedding-3-small - codexlens index all . 
--skip-splade - """ - _configure_logging(verbose, json_mode) - - base_path = path.expanduser().resolve() - results = { - "path": str(base_path), - "steps": {}, - } - - # Step 1: Run init - if not json_mode: - console.print(f"[bold]Step 1/3: Initializing FTS index...[/bold]") - - try: - # Import and call the init function directly - from codexlens.config import Config - from codexlens.storage.index_tree import IndexTreeBuilder - - config = Config() - languages = _parse_languages(language) - registry = RegistryStore() - registry.initialize() - mapper = PathMapper() - - builder = IndexTreeBuilder(registry, mapper, config, incremental=not force) - build_result = builder.build( - source_root=base_path, - languages=languages, - workers=workers, - force_full=force, - ) - - results["steps"]["init"] = { - "success": True, - "files_indexed": build_result.total_files, - "dirs_indexed": build_result.total_dirs, - "index_root": str(build_result.index_root), - } - - if not json_mode: - console.print(f" [green]OK[/green] Indexed {build_result.total_files} files in {build_result.total_dirs} directories") - - index_root = Path(build_result.index_root) - registry.close() - - except Exception as e: - results["steps"]["init"] = {"success": False, "error": str(e)} - if json_mode: - print_json(success=False, result=results, error=f"Init failed: {e}") - else: - console.print(f" [red]Error:[/red] {e}") - raise typer.Exit(code=1) - - # Step 2: Generate embeddings - if not json_mode: - console.print(f"\n[bold]Step 2/3: Generating dense embeddings...[/bold]") - - try: - from codexlens.cli.embedding_manager import generate_dense_embeddings_centralized - - def progress_update(msg: str): - if not json_mode and verbose: - console.print(f" {msg}") - - embed_result = generate_dense_embeddings_centralized( - index_root, - embedding_backend=backend, - model_profile=model, - force=force, - chunk_size=2000, - progress_callback=progress_update, - max_workers=max_workers, - ) - - if 
embed_result["success"]: - data = embed_result["result"] - results["steps"]["embeddings"] = { - "success": True, - "chunks_created": data.get("chunks_created", 0), - "files_processed": data.get("files_processed", 0), - } - if not json_mode: - console.print(f" [green]OK[/green] Generated {data.get('chunks_created', 0)} chunks for {data.get('files_processed', 0)} files") - else: - results["steps"]["embeddings"] = { - "success": False, - "error": embed_result.get("error"), - } - if not json_mode: - console.print(f" [yellow]Warning:[/yellow] {embed_result.get('error', 'Unknown error')}") - - except Exception as e: - results["steps"]["embeddings"] = {"success": False, "error": str(e)} - if not json_mode: - console.print(f" [yellow]Warning:[/yellow] {e}") - - # Step 3: Generate SPLADE index (unless skipped) - if not skip_splade: - if not json_mode: - console.print(f"\n[bold]Step 3/3: Generating SPLADE index...[/bold]") - - try: - from codexlens.semantic.splade_encoder import get_splade_encoder, check_splade_available - from codexlens.storage.splade_index import SpladeIndex - from codexlens.semantic.vector_store import VectorStore - from codexlens.config import SPLADE_DB_NAME - - ok, err = check_splade_available() - if not ok: - results["steps"]["splade"] = {"success": False, "error": f"SPLADE not available: {err}"} - if not json_mode: - console.print(f" [yellow]Skipped:[/yellow] SPLADE not available ({err})") - else: - # Discover all _index.db files - all_index_dbs = sorted(index_root.rglob("_index.db")) - if not all_index_dbs: - results["steps"]["splade"] = {"success": False, "error": "No index databases found"} - if not json_mode: - console.print(f" [yellow]Skipped:[/yellow] No index databases found") - else: - # Collect chunks - all_chunks = [] - global_id = 0 - for index_db in all_index_dbs: - try: - vector_store = VectorStore(index_db) - chunks = vector_store.get_all_chunks() - for chunk in chunks: - global_id += 1 - all_chunks.append((global_id, chunk, index_db)) - 
vector_store.close() - except Exception: - pass - - if all_chunks: - splade_db = index_root / SPLADE_DB_NAME - if splade_db.exists() and force: - splade_db.unlink() - - encoder = get_splade_encoder() - splade_index = SpladeIndex(splade_db) - splade_index.create_tables() - - chunk_metadata_batch = [] - import json as json_module - for gid, chunk, source_db_path in all_chunks: - sparse_vec = encoder.encode_text(chunk.content) - splade_index.add_posting(gid, sparse_vec) - metadata_str = None - if hasattr(chunk, 'metadata') and chunk.metadata: - try: - metadata_str = json_module.dumps(chunk.metadata) if isinstance(chunk.metadata, dict) else chunk.metadata - except Exception: - pass - chunk_metadata_batch.append(( - gid, - chunk.file_path or "", - chunk.content, - metadata_str, - str(source_db_path) - )) - - if chunk_metadata_batch: - splade_index.add_chunks_metadata_batch(chunk_metadata_batch) - - splade_index.set_metadata( - model_name=encoder.model_name, - vocab_size=encoder.vocab_size - ) - - stats = splade_index.get_stats() - results["steps"]["splade"] = { - "success": True, - "chunks": stats['unique_chunks'], - "postings": stats['total_postings'], - } - if not json_mode: - console.print(f" [green]OK[/green] SPLADE index built: {stats['unique_chunks']} chunks, {stats['total_postings']} postings") - else: - results["steps"]["splade"] = {"success": False, "error": "No chunks found"} - if not json_mode: - console.print(f" [yellow]Skipped:[/yellow] No chunks found in indexes") - - except Exception as e: - results["steps"]["splade"] = {"success": False, "error": str(e)} - if not json_mode: - console.print(f" [yellow]Warning:[/yellow] {e}") - else: - results["steps"]["splade"] = {"success": True, "skipped": True} - if not json_mode: - console.print(f"\n[bold]Step 3/3: SPLADE index...[/bold]") - console.print(f" [dim]Skipped (--skip-splade)[/dim]") - - # Summary - if json_mode: - print_json(success=True, result=results) - else: - console.print(f"\n[bold]Indexing 
Complete[/bold]") - init_ok = results["steps"].get("init", {}).get("success", False) - emb_ok = results["steps"].get("embeddings", {}).get("success", False) - splade_ok = results["steps"].get("splade", {}).get("success", False) - console.print(f" FTS Index: {'[green]OK[/green]' if init_ok else '[red]Failed[/red]'}") - console.print(f" Embeddings: {'[green]OK[/green]' if emb_ok else '[yellow]Partial/Skipped[/yellow]'}") - console.print(f" SPLADE: {'[green]OK[/green]' if splade_ok else '[yellow]Partial/Skipped[/yellow]'}") - - -# ==================== Index Migration Commands ==================== - -# Index version for migration tracking (file-based version marker) -INDEX_FORMAT_VERSION = "2.0" -INDEX_VERSION_FILE = "_index_version.txt" - - -def _get_index_version(index_root: Path) -> Optional[str]: - """Read index format version from version marker file. - - Args: - index_root: Root directory of the index - - Returns: - Version string if file exists, None otherwise - """ - version_file = index_root / INDEX_VERSION_FILE - if version_file.exists(): - try: - return version_file.read_text(encoding="utf-8").strip() - except Exception: - return None - return None - - -def _set_index_version(index_root: Path, version: str) -> None: - """Write index format version to version marker file. - - Args: - index_root: Root directory of the index - version: Version string to write - """ - version_file = index_root / INDEX_VERSION_FILE - version_file.write_text(version, encoding="utf-8") - - -def _discover_distributed_splade(index_root: Path) -> List[Dict[str, Any]]: - """Discover distributed SPLADE data in _index.db files. - - Scans all _index.db files for embedded splade_postings tables. - This is the old distributed format that needs migration. 
- - Args: - index_root: Root directory to scan - - Returns: - List of dicts with db_path, posting_count, chunk_count - """ - results = [] - - for db_path in index_root.rglob("_index.db"): - try: - conn = sqlite3.connect(db_path, timeout=5.0) - conn.row_factory = sqlite3.Row - - # Check if splade_postings table exists (old embedded format) - cursor = conn.execute( - "SELECT name FROM sqlite_master WHERE type='table' AND name='splade_postings'" - ) - if cursor.fetchone(): - # Count postings and chunks - try: - row = conn.execute( - "SELECT COUNT(*) as postings, COUNT(DISTINCT chunk_id) as chunks FROM splade_postings" - ).fetchone() - results.append({ - "db_path": db_path, - "posting_count": row["postings"] if row else 0, - "chunk_count": row["chunks"] if row else 0, - }) - except Exception: - pass - - conn.close() - except Exception: - pass - - return results - - -def _discover_distributed_hnsw(index_root: Path) -> List[Dict[str, Any]]: - """Discover distributed HNSW index files. - - Scans for .hnsw files that are stored alongside _index.db files. - This is the old distributed format that needs migration. - - Args: - index_root: Root directory to scan - - Returns: - List of dicts with hnsw_path, size_bytes - """ - results = [] - - for hnsw_path in index_root.rglob("*.hnsw"): - try: - size = hnsw_path.stat().st_size - results.append({ - "hnsw_path": hnsw_path, - "size_bytes": size, - }) - except Exception: - pass - - return results - - -def _check_centralized_storage(index_root: Path) -> Dict[str, Any]: - """Check for centralized storage files. 
- - Args: - index_root: Root directory to check - - Returns: - Dict with has_splade, has_vectors, splade_stats, vector_stats - """ - from codexlens.config import SPLADE_DB_NAME, VECTORS_HNSW_NAME - - splade_db = index_root / SPLADE_DB_NAME - vectors_hnsw = index_root / VECTORS_HNSW_NAME - - result = { - "has_splade": splade_db.exists(), - "has_vectors": vectors_hnsw.exists(), - "splade_path": str(splade_db) if splade_db.exists() else None, - "vectors_path": str(vectors_hnsw) if vectors_hnsw.exists() else None, - "splade_stats": None, - "vector_stats": None, - } - - # Get SPLADE stats if exists - if splade_db.exists(): - try: - from codexlens.storage.splade_index import SpladeIndex - splade = SpladeIndex(splade_db) - if splade.has_index(): - result["splade_stats"] = splade.get_stats() - splade.close() - except Exception: - pass - - # Get vector stats if exists - if vectors_hnsw.exists(): - try: - result["vector_stats"] = { - "size_bytes": vectors_hnsw.stat().st_size, - } - except Exception: - pass - - return result - - -@index_app.command("migrate") -def index_migrate_cmd( - path: Annotated[Optional[str], typer.Argument(help="Project path to migrate")] = None, - dry_run: Annotated[bool, typer.Option("--dry-run", help="Show what would be migrated without making changes")] = False, - force: Annotated[bool, typer.Option("--force", help="Force migration even if already migrated")] = False, - json_mode: Annotated[bool, typer.Option("--json", help="Output JSON response")] = False, - verbose: Annotated[bool, typer.Option("--verbose", "-v", help="Enable verbose output")] = False, -) -> None: - """Migrate old distributed index to new centralized architecture. - - This command upgrades indexes from the old distributed storage format - (where SPLADE/vectors were stored in each _index.db) to the new centralized - format (single _splade.db and _vectors.hnsw at index root). - - Migration Steps: - 1. Detect if migration is needed (check version marker) - 2. 
Discover distributed SPLADE data in _index.db files - 3. Discover distributed .hnsw files - 4. Report current status - 5. Create version marker (unless --dry-run) - - Use --dry-run to preview what would be migrated without making changes. - Use --force to re-run migration even if version marker exists. - - Note: For full data migration (SPLADE/vectors consolidation), run: - codexlens index splade --rebuild - codexlens index embeddings --force - - Examples: - codexlens index migrate ~/projects/my-app --dry-run - codexlens index migrate . --force - codexlens index migrate --json - """ - _configure_logging(verbose, json_mode) - - # Resolve target path - if path: - target_path = Path(path).expanduser().resolve() - else: - target_path = Path.cwd() - - if not target_path.exists(): - if json_mode: - print_json(success=False, error=f"Path does not exist: {target_path}") - else: - console.print(f"[red]Error:[/red] Path does not exist: {target_path}") - raise typer.Exit(code=1) - - # Find index root - registry: RegistryStore | None = None - index_root: Optional[Path] = None - - try: - registry = RegistryStore() - registry.initialize() - mapper = PathMapper() - - # Check if path is a project with an index - project_info = registry.get_project(target_path) - if project_info: - index_root = Path(project_info.index_root) - else: - # Try to find index via mapper - index_db = mapper.source_to_index_db(target_path) - if index_db.exists(): - index_root = index_db.parent - finally: - if registry: - registry.close() - - if not index_root or not index_root.exists(): - if json_mode: - print_json(success=False, error=f"No index found for: {target_path}") - else: - console.print(f"[red]Error:[/red] No index found for: {target_path}") - console.print("[dim]Run 'codexlens init' first to create an index.[/dim]") - raise typer.Exit(code=1) - - if not json_mode: - console.print(f"[bold]Index Migration Check[/bold]") - console.print(f"Source path: [dim]{target_path}[/dim]") - 
console.print(f"Index root: [dim]{index_root}[/dim]") - if dry_run: - console.print("[yellow]Mode: DRY RUN (no changes will be made)[/yellow]") - console.print() - - # Check current version - current_version = _get_index_version(index_root) - needs_migration = current_version is None or (force and current_version != INDEX_FORMAT_VERSION) - - if current_version and current_version >= INDEX_FORMAT_VERSION and not force: - result = { - "path": str(target_path), - "index_root": str(index_root), - "current_version": current_version, - "target_version": INDEX_FORMAT_VERSION, - "needs_migration": False, - "message": "Index is already at the latest version", - } - - if json_mode: - print_json(success=True, result=result) - else: - console.print(f"[green]OK[/green] Index is already at version {current_version}") - console.print("[dim]No migration needed. Use --force to re-run migration.[/dim]") - return - - # Discover distributed data - distributed_splade = _discover_distributed_splade(index_root) - distributed_hnsw = _discover_distributed_hnsw(index_root) - centralized = _check_centralized_storage(index_root) - - # Count all _index.db files - all_index_dbs = list(index_root.rglob("_index.db")) - - # Build migration report - migration_report = { - "path": str(target_path), - "index_root": str(index_root), - "dry_run": dry_run, - "current_version": current_version, - "target_version": INDEX_FORMAT_VERSION, - "needs_migration": needs_migration, - "discovery": { - "total_index_dbs": len(all_index_dbs), - "distributed_splade_count": len(distributed_splade), - "distributed_splade_total_postings": sum(d["posting_count"] for d in distributed_splade), - "distributed_hnsw_count": len(distributed_hnsw), - "distributed_hnsw_total_bytes": sum(d["size_bytes"] for d in distributed_hnsw), - }, - "centralized": centralized, - "recommendations": [], - } - - # Generate recommendations - if distributed_splade and not centralized["has_splade"]: - migration_report["recommendations"].append( - 
f"Run 'codexlens splade-index {target_path} --rebuild' to consolidate SPLADE data" - ) - - if distributed_hnsw and not centralized["has_vectors"]: - migration_report["recommendations"].append( - f"Run 'codexlens embeddings-generate {target_path} --recursive --force' to consolidate vector data" - ) - - if not distributed_splade and not distributed_hnsw: - migration_report["recommendations"].append( - "No distributed data found. Index may already be using centralized storage." - ) - - if json_mode: - # Perform migration action (set version marker) unless dry-run - if not dry_run and needs_migration: - _set_index_version(index_root, INDEX_FORMAT_VERSION) - migration_report["migrated"] = True - migration_report["new_version"] = INDEX_FORMAT_VERSION - else: - migration_report["migrated"] = False - - print_json(success=True, result=migration_report) - else: - # Display discovery results - console.print("[bold]Discovery Results:[/bold]") - console.print(f" Total _index.db files: {len(all_index_dbs)}") - console.print() - - # Distributed SPLADE - console.print("[bold]Distributed SPLADE Data:[/bold]") - if distributed_splade: - total_postings = sum(d["posting_count"] for d in distributed_splade) - total_chunks = sum(d["chunk_count"] for d in distributed_splade) - console.print(f" Found in {len(distributed_splade)} _index.db files") - console.print(f" Total postings: {total_postings:,}") - console.print(f" Total chunks: {total_chunks:,}") - if verbose: - for d in distributed_splade[:5]: - console.print(f" [dim]{d['db_path'].parent.name}: {d['posting_count']} postings[/dim]") - if len(distributed_splade) > 5: - console.print(f" [dim]... 
and {len(distributed_splade) - 5} more[/dim]") - else: - console.print(" [dim]None found (already centralized or not generated)[/dim]") - console.print() - - # Distributed HNSW - console.print("[bold]Distributed HNSW Files:[/bold]") - if distributed_hnsw: - total_size = sum(d["size_bytes"] for d in distributed_hnsw) - console.print(f" Found {len(distributed_hnsw)} .hnsw files") - console.print(f" Total size: {total_size / (1024 * 1024):.1f} MB") - if verbose: - for d in distributed_hnsw[:5]: - console.print(f" [dim]{d['hnsw_path'].name}: {d['size_bytes'] / 1024:.1f} KB[/dim]") - if len(distributed_hnsw) > 5: - console.print(f" [dim]... and {len(distributed_hnsw) - 5} more[/dim]") - else: - console.print(" [dim]None found (already centralized or not generated)[/dim]") - console.print() - - # Centralized storage status - console.print("[bold]Centralized Storage:[/bold]") - if centralized["has_splade"]: - stats = centralized.get("splade_stats") or {} - console.print(f" [green]OK[/green] _splade.db exists") - if stats: - console.print(f" Chunks: {stats.get('unique_chunks', 0):,}") - console.print(f" Postings: {stats.get('total_postings', 0):,}") - else: - console.print(f" [yellow]--[/yellow] _splade.db not found") - - if centralized["has_vectors"]: - stats = centralized.get("vector_stats") or {} - size_mb = stats.get("size_bytes", 0) / (1024 * 1024) - console.print(f" [green]OK[/green] _vectors.hnsw exists ({size_mb:.1f} MB)") - else: - console.print(f" [yellow]--[/yellow] _vectors.hnsw not found") - console.print() - - # Migration action - if not dry_run and needs_migration: - _set_index_version(index_root, INDEX_FORMAT_VERSION) - console.print(f"[green]OK[/green] Version marker created: {INDEX_FORMAT_VERSION}") - elif dry_run: - console.print(f"[yellow]DRY RUN:[/yellow] Would create version marker: {INDEX_FORMAT_VERSION}") - - # Recommendations - if migration_report["recommendations"]: - console.print("\n[bold]Recommendations:[/bold]") - for rec in 
migration_report["recommendations"]: - console.print(f" [cyan]>[/cyan] {rec}") - - -# ==================== Deprecated Command Aliases ==================== -# These commands maintain backward compatibility with the old CLI structure. -# They display deprecation warnings and delegate to the new `index` subcommands. - - -@app.command("embeddings-generate", hidden=True, deprecated=True) -def embeddings_generate_deprecated( - path: Path = typer.Argument( - ..., - exists=True, - help="Path to _index.db file or project directory.", - ), - backend: str = typer.Option( - "fastembed", - "--backend", - "-b", - help="Embedding backend: fastembed (local) or litellm (remote API).", - ), - model: str = typer.Option( - "code", - "--model", - "-m", - help="Model: profile name for fastembed or model name for litellm.", - ), - force: bool = typer.Option( - False, - "--force", - "-f", - help="Force regeneration even if embeddings exist.", - ), - chunk_size: int = typer.Option( - 2000, - "--chunk-size", - help="Maximum chunk size in characters.", - ), - max_workers: int = typer.Option( - 1, - "--max-workers", - "-w", - min=1, - help="Max concurrent API calls.", - ), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), - verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose output."), - centralized: bool = typer.Option( - True, - "--centralized/--distributed", - "-c/-d", - help="Use centralized vector storage (default) or distributed.", - ), -) -> None: - """[Deprecated] Use 'codexlens index embeddings' instead.""" - _deprecated_command_warning("embeddings-generate", "index embeddings") - index_embeddings( - path=path, - backend=backend, - model=model, - force=force, - chunk_size=chunk_size, - max_workers=max_workers, - json_mode=json_mode, - verbose=verbose, - centralized=centralized, - ) - - -@app.command("init", hidden=True, deprecated=True) -def init_deprecated( - path: Path = typer.Argument(Path("."), exists=True, file_okay=False, 
dir_okay=True, help="Project root to index."), - language: Optional[List[str]] = typer.Option(None, "--language", "-l", help="Limit indexing to specific languages."), - workers: Optional[int] = typer.Option(None, "--workers", "-w", min=1, help="Parallel worker processes."), - force: bool = typer.Option(False, "--force", "-f", help="Force full reindex."), - no_embeddings: bool = typer.Option(False, "--no-embeddings", help="Skip automatic embedding generation."), - backend: str = typer.Option("fastembed", "--backend", "-b", help="Embedding backend."), - model: str = typer.Option("code", "--model", "-m", help="Embedding model."), - max_workers: int = typer.Option(1, "--max-workers", min=1, help="Max concurrent API calls."), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), - verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."), -) -> None: - """[Deprecated] Use 'codexlens index init' instead.""" - _deprecated_command_warning("init", "index init") - index_init( - path=path, - language=language, - workers=workers, - force=force, - no_embeddings=no_embeddings, - backend=backend, - model=model, - max_workers=max_workers, - json_mode=json_mode, - verbose=verbose, - ) - - -@app.command("splade-index", hidden=True, deprecated=True) -def splade_index_deprecated( - path: Path = typer.Argument(..., help="Project path to index"), - rebuild: bool = typer.Option(False, "--rebuild", "-r", help="Force rebuild SPLADE index"), - verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose output."), -) -> None: - """[Deprecated] Use 'codexlens index splade' instead.""" - _deprecated_command_warning("splade-index", "index splade") - index_splade( - path=path, - rebuild=rebuild, - verbose=verbose, - ) - - -@app.command("cascade-index", hidden=True, deprecated=True) -def cascade_index_deprecated( - path: Annotated[Path, typer.Argument(help="Directory to index")], - force: Annotated[bool, typer.Option("--force", 
"-f", help="Force regenerate")] = False, - batch_size: Annotated[int, typer.Option("--batch-size", "-b", help="Batch size for embedding")] = 32, - json_mode: Annotated[bool, typer.Option("--json", help="Output JSON response")] = False, - verbose: Annotated[bool, typer.Option("--verbose", "-v", help="Enable verbose logging")] = False, -) -> None: - """[Deprecated] Use 'codexlens index binary' instead.""" - _deprecated_command_warning("cascade-index", "index binary") - index_binary( - path=path, - force=force, - batch_size=batch_size, - json_mode=json_mode, - verbose=verbose, - ) - - -@app.command("index-migrate", hidden=True, deprecated=True) -def index_migrate_deprecated( - path: Annotated[Optional[str], typer.Argument(help="Project path to migrate")] = None, - dry_run: Annotated[bool, typer.Option("--dry-run", help="Show what would be migrated")] = False, - force: Annotated[bool, typer.Option("--force", help="Force migration")] = False, - json_mode: Annotated[bool, typer.Option("--json", help="Output JSON response")] = False, - verbose: Annotated[bool, typer.Option("--verbose", "-v", help="Enable verbose output")] = False, -) -> None: - """[Deprecated] Use 'codexlens index migrate' instead.""" - _deprecated_command_warning("index-migrate", "index migrate") - index_migrate_cmd( - path=path, - dry_run=dry_run, - force=force, - json_mode=json_mode, - verbose=verbose, - ) - diff --git a/codex-lens/build/lib/codexlens/cli/embedding_manager.py b/codex-lens/build/lib/codexlens/cli/embedding_manager.py deleted file mode 100644 index bb6467f5..00000000 --- a/codex-lens/build/lib/codexlens/cli/embedding_manager.py +++ /dev/null @@ -1,2001 +0,0 @@ -"""Embedding Manager - Manage semantic embeddings for code indexes.""" - -import gc -import json -import logging -import sqlite3 -import time -from concurrent.futures import ThreadPoolExecutor, as_completed -from itertools import islice -from pathlib import Path -from typing import Any, Dict, Generator, List, Optional, Tuple - 
-try: - from codexlens.semantic import SEMANTIC_AVAILABLE, is_embedding_backend_available -except ImportError: - SEMANTIC_AVAILABLE = False - def is_embedding_backend_available(_backend: str): # type: ignore[no-redef] - return False, "codexlens.semantic not available" - -try: - from codexlens.config import VECTORS_META_DB_NAME -except ImportError: - VECTORS_META_DB_NAME = "_vectors_meta.db" - -try: - from codexlens.search.ranking import get_file_category -except ImportError: - def get_file_category(path: str): # type: ignore[no-redef] - """Fallback: map common extensions to category.""" - ext = Path(path).suffix.lower() - code_exts = {".py", ".js", ".jsx", ".ts", ".tsx", ".java", ".go", ".c", ".cpp", ".rs"} - doc_exts = {".md", ".mdx", ".txt", ".rst"} - if ext in code_exts: - return "code" - elif ext in doc_exts: - return "doc" - return None - -logger = logging.getLogger(__name__) - -# Embedding batch size - larger values improve throughput on modern hardware -# Benchmark: 256 gives ~2.35x speedup over 64 with DirectML GPU acceleration -EMBEDDING_BATCH_SIZE = 256 - - -def calculate_dynamic_batch_size(config, embedder) -> int: - """Calculate batch size dynamically based on model token capacity. 
- - This function computes an optimal batch size by considering: - - Maximum chunk character size from parsing rules - - Estimated tokens per chunk (chars / chars_per_token_estimate) - - Model's maximum token capacity - - Utilization factor (default 80% to leave headroom) - - Args: - config: Config object with api_batch_size_* settings - embedder: Embedding model object with max_tokens property - - Returns: - Calculated batch size, clamped to [1, api_batch_size_max] - """ - # If dynamic calculation is disabled, return static value - if not getattr(config, 'api_batch_size_dynamic', False): - return getattr(config, 'api_batch_size', 8) - - # Get maximum chunk character size from ALL parsing rules (not just default) - # This ensures we use the worst-case chunk size across all languages - parsing_rules = getattr(config, 'parsing_rules', {}) - all_max_chunk_chars = [ - rule.get('max_chunk_chars', 0) - for rule in parsing_rules.values() - if isinstance(rule, dict) - ] - max_chunk_chars = max(all_max_chunk_chars) if all_max_chunk_chars else 4000 - if max_chunk_chars <= 0: - max_chunk_chars = 4000 # Final fallback - - # Get characters per token estimate - chars_per_token = getattr(config, 'chars_per_token_estimate', 4) - if chars_per_token <= 0: - chars_per_token = 4 # Safe default - - # Estimate tokens per chunk - estimated_tokens_per_chunk = max_chunk_chars / chars_per_token - - # Prevent division by zero - if estimated_tokens_per_chunk <= 0: - return getattr(config, 'api_batch_size', 8) - - # Get model's maximum token capacity - model_max_tokens = getattr(embedder, 'max_tokens', 8192) - - # Get utilization factor (default 80%, max 95% to leave safety margin) - utilization_factor = getattr(config, 'api_batch_size_utilization_factor', 0.8) - if utilization_factor <= 0 or utilization_factor > 0.95: - if utilization_factor > 0.95: - logger.warning( - "Utilization factor %.2f exceeds safe limit 0.95. " - "Token estimation is approximate, high values risk API errors. 
" - "Clamping to 0.95.", - utilization_factor - ) - utilization_factor = 0.95 - else: - utilization_factor = 0.8 - - # Calculate safe token limit - safe_token_limit = model_max_tokens * utilization_factor - - # Calculate dynamic batch size - dynamic_batch_size = int(safe_token_limit / estimated_tokens_per_chunk) - - # Get maximum batch size limit - batch_size_max = getattr(config, 'api_batch_size_max', 2048) - - # Clamp to [1, batch_size_max] - result = max(1, min(dynamic_batch_size, batch_size_max)) - - logger.debug( - "Dynamic batch size calculated: %d (max_chunk_chars=%d, chars_per_token=%d, " - "model_max_tokens=%d, utilization=%.1f%%, limit=%d)", - result, max_chunk_chars, chars_per_token, model_max_tokens, - utilization_factor * 100, batch_size_max - ) - - return result - - -def _build_categories_from_batch(chunk_batch: List[Tuple[Any, str]]) -> List[str]: - """Build categories list from chunk batch for index-level category filtering. - - Args: - chunk_batch: List of (chunk, file_path) tuples - - Returns: - List of category strings ('code' or 'doc'), defaulting to 'code' for unknown - """ - categories = [] - for _, file_path in chunk_batch: - cat = get_file_category(file_path) - categories.append(cat if cat else "code") # Default to 'code' for unknown extensions - return categories - - -def _cleanup_fastembed_resources() -> None: - """Best-effort cleanup for fastembed/ONNX resources (no-op for other backends).""" - try: - from codexlens.semantic.embedder import clear_embedder_cache - clear_embedder_cache() - except Exception: - pass - - -def _cleanup_splade_resources() -> None: - """Release SPLADE encoder ONNX resources.""" - try: - from codexlens.semantic.splade_encoder import clear_splade_cache - clear_splade_cache() - except Exception: - pass - - -def _generate_chunks_from_cursor( - cursor, - chunker, - path_column: str, - file_batch_size: int, - failed_files: List[Tuple[str, str]], -) -> Generator[Tuple, None, Tuple[int, int]]: - """Generator that yields 
chunks from database cursor in a streaming fashion. - - This avoids loading all chunks into memory at once, significantly reducing - peak memory usage for large codebases. - - Args: - cursor: SQLite cursor with file data - chunker: Chunker instance for splitting files - path_column: Column name for file path - file_batch_size: Number of files to fetch at a time - failed_files: List to append failed files to - - Yields: - (chunk, file_path) tuples - - Returns: - (total_files_processed, batch_count) after iteration completes - """ - total_files = 0 - batch_count = 0 - - while True: - file_batch = cursor.fetchmany(file_batch_size) - if not file_batch: - break - - batch_count += 1 - - for file_row in file_batch: - file_path = file_row[path_column] - content = file_row["content"] - language = file_row["language"] or "python" - - try: - chunks = chunker.chunk_sliding_window( - content, - file_path=file_path, - language=language - ) - if chunks: - total_files += 1 - for chunk in chunks: - yield (chunk, file_path) - except Exception as e: - logger.error(f"Failed to chunk {file_path}: {e}") - failed_files.append((file_path, str(e))) - - -def _create_token_aware_batches( - chunk_generator: Generator, - max_tokens_per_batch: int = 8000, -) -> Generator[List[Tuple], None, None]: - """Group chunks by total token count instead of fixed count. - - Uses fast token estimation (len(content) // 4) for efficiency. - Yields batches when approaching the token limit. 
- - Args: - chunk_generator: Generator yielding (chunk, file_path) tuples - max_tokens_per_batch: Maximum tokens per batch (default: 8000) - - Yields: - List of (chunk, file_path) tuples representing a batch - """ - current_batch = [] - current_tokens = 0 - - for chunk, file_path in chunk_generator: - # Fast token estimation: len(content) // 4 - chunk_tokens = len(chunk.content) // 4 - - # If adding this chunk would exceed limit and we have items, yield current batch - if current_tokens + chunk_tokens > max_tokens_per_batch and current_batch: - yield current_batch - current_batch = [] - current_tokens = 0 - - # Add chunk to current batch - current_batch.append((chunk, file_path)) - current_tokens += chunk_tokens - - # Yield final batch if not empty - if current_batch: - yield current_batch - - -def _get_path_column(conn: sqlite3.Connection) -> str: - """Detect whether files table uses 'path' or 'full_path' column. - - Args: - conn: SQLite connection to the index database - - Returns: - Column name ('path' or 'full_path') - - Raises: - ValueError: If neither column exists in files table - """ - cursor = conn.execute("PRAGMA table_info(files)") - columns = {row[1] for row in cursor.fetchall()} - if 'full_path' in columns: - return 'full_path' - elif 'path' in columns: - return 'path' - raise ValueError("files table has neither 'path' nor 'full_path' column") - - -def check_index_embeddings(index_path: Path) -> Dict[str, any]: - """Check if an index has embeddings and return statistics. 
- - Args: - index_path: Path to _index.db file - - Returns: - Dictionary with embedding statistics and status - """ - if not index_path.exists(): - return { - "success": False, - "error": f"Index not found: {index_path}", - } - - try: - with sqlite3.connect(index_path) as conn: - # Check if semantic_chunks table exists - cursor = conn.execute( - "SELECT name FROM sqlite_master WHERE type='table' AND name='semantic_chunks'" - ) - table_exists = cursor.fetchone() is not None - - if not table_exists: - # Count total indexed files even without embeddings - cursor = conn.execute("SELECT COUNT(*) FROM files") - total_files = cursor.fetchone()[0] - - return { - "success": True, - "result": { - "has_embeddings": False, - "total_chunks": 0, - "total_files": total_files, - "files_with_chunks": 0, - "files_without_chunks": total_files, - "coverage_percent": 0.0, - "missing_files_sample": [], - "index_path": str(index_path), - }, - } - - # Count total chunks - cursor = conn.execute("SELECT COUNT(*) FROM semantic_chunks") - total_chunks = cursor.fetchone()[0] - - # Count total indexed files - cursor = conn.execute("SELECT COUNT(*) FROM files") - total_files = cursor.fetchone()[0] - - # Count files with embeddings - cursor = conn.execute( - "SELECT COUNT(DISTINCT file_path) FROM semantic_chunks" - ) - files_with_chunks = cursor.fetchone()[0] - - # Get a sample of files without embeddings - path_column = _get_path_column(conn) - cursor = conn.execute(f""" - SELECT {path_column} - FROM files - WHERE {path_column} NOT IN ( - SELECT DISTINCT file_path FROM semantic_chunks - ) - LIMIT 5 - """) - missing_files = [row[0] for row in cursor.fetchall()] - - return { - "success": True, - "result": { - "has_embeddings": total_chunks > 0, - "total_chunks": total_chunks, - "total_files": total_files, - "files_with_chunks": files_with_chunks, - "files_without_chunks": total_files - files_with_chunks, - "coverage_percent": round((files_with_chunks / total_files * 100) if total_files > 0 else 0, 
1), - "missing_files_sample": missing_files, - "index_path": str(index_path), - }, - } - - except Exception as e: - return { - "success": False, - "error": f"Failed to check embeddings: {str(e)}", - } - - -def _get_embedding_defaults() -> tuple[str, str, bool, List, str, float]: - """Get default embedding settings from config. - - Returns: - Tuple of (backend, model, use_gpu, endpoints, strategy, cooldown) - """ - try: - from codexlens.config import Config - config = Config.load() - return ( - config.embedding_backend, - config.embedding_model, - config.embedding_use_gpu, - config.embedding_endpoints, - config.embedding_strategy, - config.embedding_cooldown, - ) - except Exception: - return "fastembed", "code", True, [], "latency_aware", 60.0 - - -def generate_embeddings( - index_path: Path, - embedding_backend: Optional[str] = None, - model_profile: Optional[str] = None, - force: bool = False, - chunk_size: int = 2000, - overlap: int = 200, - progress_callback: Optional[callable] = None, - use_gpu: Optional[bool] = None, - max_tokens_per_batch: Optional[int] = None, - max_workers: Optional[int] = None, - endpoints: Optional[List] = None, - strategy: Optional[str] = None, - cooldown: Optional[float] = None, - splade_db_path: Optional[Path] = None, -) -> Dict[str, any]: - """Generate embeddings for an index using memory-efficient batch processing. - - This function processes files in small batches to keep memory usage under 2GB, - regardless of the total project size. Supports concurrent API calls for - LiteLLM backend to improve throughput. - - Args: - index_path: Path to _index.db file - embedding_backend: Embedding backend to use (fastembed or litellm). - Defaults to config setting. - model_profile: Model profile for fastembed (fast, code, multilingual, balanced) - or model name for litellm (e.g., qwen3-embedding). - Defaults to config setting. 
- force: If True, regenerate even if embeddings exist - chunk_size: Maximum chunk size in characters - overlap: Overlap size in characters for sliding window chunking (default: 200) - progress_callback: Optional callback for progress updates - use_gpu: Whether to use GPU acceleration (fastembed only). - Defaults to config setting. - max_tokens_per_batch: Maximum tokens per batch for token-aware batching. - If None, attempts to get from embedder.max_tokens, - then falls back to 8000. If set, overrides automatic detection. - max_workers: Maximum number of concurrent API calls. - If None, uses dynamic defaults based on backend and endpoint count. - endpoints: Optional list of endpoint configurations for multi-API load balancing. - Each dict has keys: model, api_key, api_base, weight. - strategy: Selection strategy for multi-endpoint mode (round_robin, latency_aware). - cooldown: Default cooldown seconds for rate-limited endpoints. - splade_db_path: Optional path to centralized SPLADE database. If None, SPLADE - is written to index_path (legacy behavior). Use index_root / SPLADE_DB_NAME - for centralized storage. 
- - Returns: - Result dictionary with generation statistics - """ - # Get defaults from config if not specified - (default_backend, default_model, default_gpu, - default_endpoints, default_strategy, default_cooldown) = _get_embedding_defaults() - - if embedding_backend is None: - embedding_backend = default_backend - if model_profile is None: - model_profile = default_model - if use_gpu is None: - use_gpu = default_gpu - if endpoints is None: - endpoints = default_endpoints - if strategy is None: - strategy = default_strategy - if cooldown is None: - cooldown = default_cooldown - - # Calculate endpoint count for worker scaling - endpoint_count = len(endpoints) if endpoints else 1 - - # Set dynamic max_workers default based on backend type and endpoint count - # - FastEmbed: CPU-bound, sequential is optimal (1 worker) - # - LiteLLM single endpoint: 4 workers default - # - LiteLLM multi-endpoint: workers = endpoint_count * 2 (to saturate all APIs) - if max_workers is None: - if embedding_backend == "litellm": - if endpoint_count > 1: - max_workers = endpoint_count * 2 # No cap, scale with endpoints - else: - max_workers = 4 - else: - max_workers = 1 - - backend_available, backend_error = is_embedding_backend_available(embedding_backend) - if not backend_available: - return {"success": False, "error": backend_error or "Embedding backend not available"} - - if not index_path.exists(): - return { - "success": False, - "error": f"Index not found: {index_path}", - } - - # Check existing chunks - status = check_index_embeddings(index_path) - if not status["success"]: - return status - - existing_chunks = status["result"]["total_chunks"] - - if existing_chunks > 0 and not force: - return { - "success": False, - "error": f"Index already has {existing_chunks} chunks. 
Use --force to regenerate.", - "existing_chunks": existing_chunks, - } - - if force and existing_chunks > 0: - if progress_callback: - progress_callback(f"Clearing {existing_chunks} existing chunks...") - - try: - with sqlite3.connect(index_path) as conn: - conn.execute("DELETE FROM semantic_chunks") - conn.commit() - except Exception as e: - return { - "success": False, - "error": f"Failed to clear existing chunks: {str(e)}", - } - - # Initialize components - try: - # Import factory function to support both backends - from codexlens.semantic.factory import get_embedder as get_embedder_factory - from codexlens.semantic.vector_store import VectorStore - from codexlens.semantic.chunker import Chunker, ChunkConfig - - # Initialize embedder using factory (supports fastembed, litellm, and rotational) - # For fastembed: model_profile is a profile name (fast/code/multilingual/balanced) - # For litellm: model_profile is a model name (e.g., qwen3-embedding) - # For multi-endpoint: endpoints list enables load balancing - if embedding_backend == "fastembed": - embedder = get_embedder_factory(backend="fastembed", profile=model_profile, use_gpu=use_gpu) - elif embedding_backend == "litellm": - embedder = get_embedder_factory( - backend="litellm", - model=model_profile, - endpoints=endpoints if endpoints else None, - strategy=strategy, - cooldown=cooldown, - ) - else: - return { - "success": False, - "error": f"Invalid embedding backend: {embedding_backend}. 
Must be 'fastembed' or 'litellm'.", - } - - # skip_token_count=True: Use fast estimation (len/4) instead of expensive tiktoken - # This significantly reduces CPU usage with minimal impact on metadata accuracy - # Load chunk stripping config from settings - from codexlens.config import Config - chunk_cfg = Config.load() - chunker = Chunker(config=ChunkConfig( - max_chunk_size=chunk_size, - overlap=overlap, - skip_token_count=True, - strip_comments=getattr(chunk_cfg, 'chunk_strip_comments', True), - strip_docstrings=getattr(chunk_cfg, 'chunk_strip_docstrings', True), - )) - - # Log embedder info with endpoint count for multi-endpoint mode - if progress_callback: - if endpoint_count > 1: - progress_callback(f"Using {endpoint_count} API endpoints with {strategy} strategy") - progress_callback(f"Using model: {embedder.model_name} ({embedder.embedding_dim} dimensions)") - - # Calculate dynamic batch size based on model capacity - from codexlens.config import Config - batch_config = Config.load() - effective_batch_size = calculate_dynamic_batch_size(batch_config, embedder) - - if progress_callback and batch_config.api_batch_size_dynamic: - progress_callback(f"Dynamic batch size: {effective_batch_size} (model max_tokens={getattr(embedder, 'max_tokens', 8192)})") - - except Exception as e: - return { - "success": False, - "error": f"Failed to initialize components: {str(e)}", - } - - # --- STREAMING PROCESSING --- - # Process files in batches to control memory usage - start_time = time.time() - failed_files = [] - total_chunks_created = 0 - total_files_processed = 0 - FILE_BATCH_SIZE = 100 # Process 100 files at a time - # effective_batch_size is calculated above (dynamic or EMBEDDING_BATCH_SIZE fallback) - - try: - with VectorStore(index_path) as vector_store: - # Check model compatibility with existing embeddings - if not force: - is_compatible, warning = vector_store.check_model_compatibility( - model_profile, embedder.model_name, embedder.embedding_dim - ) - if not 
is_compatible: - return { - "success": False, - "error": warning, - } - - # Set/update model configuration for this index - vector_store.set_model_config( - model_profile, embedder.model_name, embedder.embedding_dim, backend=embedding_backend - ) - # Use bulk insert mode for efficient batch ANN index building - # This defers ANN updates until end_bulk_insert() is called - with vector_store.bulk_insert(): - with sqlite3.connect(index_path) as conn: - conn.row_factory = sqlite3.Row - path_column = _get_path_column(conn) - - # Get total file count for progress reporting - total_files = conn.execute("SELECT COUNT(*) FROM files").fetchone()[0] - if total_files == 0: - return {"success": False, "error": "No files found in index"} - - if progress_callback: - # Format must match Node.js parseProgressLine: "Processing N files" with 'embed' keyword - progress_callback(f"Processing {total_files} files for embeddings in batches of {FILE_BATCH_SIZE}...") - - cursor = conn.execute(f"SELECT {path_column}, content, language FROM files") - - # --- STREAMING GENERATOR APPROACH --- - # Instead of accumulating all chunks from 100 files, we use a generator - # that yields chunks on-demand, keeping memory usage low and constant. 
- chunk_generator = _generate_chunks_from_cursor( - cursor, chunker, path_column, FILE_BATCH_SIZE, failed_files - ) - - # Determine max tokens per batch - # Priority: explicit parameter > embedder.max_tokens > default 8000 - if max_tokens_per_batch is None: - max_tokens_per_batch = getattr(embedder, 'max_tokens', 8000) - - # Create token-aware batches or fall back to fixed-size batching - if max_tokens_per_batch: - batch_generator = _create_token_aware_batches( - chunk_generator, max_tokens_per_batch - ) - else: - # Fallback to fixed-size batching for backward compatibility - def fixed_size_batches(): - while True: - batch = list(islice(chunk_generator, effective_batch_size)) - if not batch: - break - yield batch - batch_generator = fixed_size_batches() - - batch_number = 0 - files_seen = set() - - def compute_embeddings_only(batch_data: Tuple[int, List[Tuple]]): - """Compute embeddings for a batch (no DB write) with retry logic. - - Args: - batch_data: Tuple of (batch_number, chunk_batch) - - Returns: - Tuple of (batch_num, chunk_batch, embeddings_numpy, batch_files, error) - """ - import random - - batch_num, chunk_batch = batch_data - batch_files = set() - for _, file_path in chunk_batch: - batch_files.add(file_path) - - max_retries = 5 - base_delay = 2.0 - - for attempt in range(max_retries + 1): - try: - batch_contents = [chunk.content for chunk, _ in chunk_batch] - embeddings_numpy = embedder.embed_to_numpy(batch_contents, batch_size=effective_batch_size) - return batch_num, chunk_batch, embeddings_numpy, batch_files, None - - except Exception as e: - error_str = str(e).lower() - # Check for retryable errors (rate limit, connection, backend issues) - # Note: Some backends (e.g., ModelScope) return 400 with nested 500 errors - is_retryable = any(x in error_str for x in [ - "429", "rate limit", "connection", "timeout", - "502", "503", "504", "service unavailable", - "500", "400", "badrequesterror", "internal server error", - "11434" # Ollama port - indicates 
backend routing issue - ]) - - if attempt < max_retries and is_retryable: - sleep_time = base_delay * (2 ** attempt) + random.uniform(0, 0.5) - logger.warning(f"Batch {batch_num} failed (attempt {attempt+1}/{max_retries+1}). " - f"Retrying in {sleep_time:.1f}s. Error: {e}") - time.sleep(sleep_time) - continue - - error_msg = f"Batch {batch_num}: {str(e)}" - logger.error(f"Failed to compute embeddings for batch {batch_num}: {str(e)}") - return batch_num, chunk_batch, None, batch_files, error_msg - - # Should not reach here, but just in case - return batch_num, chunk_batch, None, batch_files, f"Batch {batch_num}: Max retries exceeded" - - # Process batches based on max_workers setting - if max_workers <= 1: - # Sequential processing - stream directly from generator (no pre-materialization) - for chunk_batch in batch_generator: - batch_number += 1 - - # Track files in this batch - batch_files = set() - for _, file_path in chunk_batch: - batch_files.add(file_path) - - # Retry logic for transient backend errors - max_retries = 5 - base_delay = 2.0 - success = False - - for attempt in range(max_retries + 1): - try: - # Generate embeddings - batch_contents = [chunk.content for chunk, _ in chunk_batch] - embeddings_numpy = embedder.embed_to_numpy(batch_contents, batch_size=effective_batch_size) - - # Store embeddings with category - categories = _build_categories_from_batch(chunk_batch) - vector_store.add_chunks_batch_numpy(chunk_batch, embeddings_numpy, categories=categories) - - files_seen.update(batch_files) - total_chunks_created += len(chunk_batch) - total_files_processed = len(files_seen) - success = True - break - - except Exception as e: - error_str = str(e).lower() - # Check for retryable errors (rate limit, connection, backend issues) - is_retryable = any(x in error_str for x in [ - "429", "rate limit", "connection", "timeout", - "502", "503", "504", "service unavailable", - "500", "400", "badrequesterror", "internal server error", - "11434" # Ollama port - 
indicates backend routing issue - ]) - - if attempt < max_retries and is_retryable: - import random - sleep_time = base_delay * (2 ** attempt) + random.uniform(0, 0.5) - logger.warning(f"Batch {batch_number} failed (attempt {attempt+1}/{max_retries+1}). " - f"Retrying in {sleep_time:.1f}s. Error: {e}") - time.sleep(sleep_time) - continue - - logger.error(f"Failed to process batch {batch_number}: {str(e)}") - files_seen.update(batch_files) - break - - if success and progress_callback and batch_number % 10 == 0: - progress_callback(f" Batch {batch_number}: {total_chunks_created} chunks, {total_files_processed} files") - else: - # Concurrent processing - main thread iterates batches (SQLite safe), - # workers compute embeddings (parallel), main thread writes to DB (serial) - if progress_callback: - progress_callback(f"Processing with {max_workers} concurrent embedding workers...") - - with ThreadPoolExecutor(max_workers=max_workers) as executor: - pending_futures = {} # future -> (batch_num, chunk_batch) - completed_batches = 0 - last_reported_batch = 0 - - def process_completed_futures(): - """Process any completed futures and write to DB.""" - nonlocal total_chunks_created, total_files_processed, completed_batches, last_reported_batch - done_futures = [f for f in pending_futures if f.done()] - for f in done_futures: - try: - batch_num, chunk_batch, embeddings_numpy, batch_files, error = f.result() - if embeddings_numpy is not None and error is None: - # Write to DB in main thread (no contention) - categories = _build_categories_from_batch(chunk_batch) - vector_store.add_chunks_batch_numpy(chunk_batch, embeddings_numpy, categories=categories) - total_chunks_created += len(chunk_batch) - files_seen.update(batch_files) - total_files_processed = len(files_seen) - completed_batches += 1 - except Exception as e: - logger.error(f"Future raised exception: {e}") - completed_batches += 1 - del pending_futures[f] - - # Report progress based on completed batches (every 5 
batches) - if progress_callback and completed_batches >= last_reported_batch + 5: - progress_callback(f" Batch {completed_batches}: {total_chunks_created} chunks, {total_files_processed} files") - last_reported_batch = completed_batches - - # Iterate batches in main thread (SQLite cursor is main-thread bound) - for chunk_batch in batch_generator: - batch_number += 1 - - # Submit compute task to worker pool - future = executor.submit(compute_embeddings_only, (batch_number, chunk_batch)) - pending_futures[future] = batch_number - - # Process any completed futures to free memory and write to DB - process_completed_futures() - - # Backpressure: wait if too many pending - while len(pending_futures) >= max_workers * 2: - process_completed_futures() - if len(pending_futures) >= max_workers * 2: - time.sleep(0.1) # time is imported at module level - - # Wait for remaining futures - for future in as_completed(list(pending_futures.keys())): - try: - batch_num, chunk_batch, embeddings_numpy, batch_files, error = future.result() - if embeddings_numpy is not None and error is None: - categories = _build_categories_from_batch(chunk_batch) - vector_store.add_chunks_batch_numpy(chunk_batch, embeddings_numpy, categories=categories) - total_chunks_created += len(chunk_batch) - files_seen.update(batch_files) - total_files_processed = len(files_seen) - completed_batches += 1 - - # Report progress for remaining batches - if progress_callback and completed_batches >= last_reported_batch + 5: - progress_callback(f" Batch {completed_batches}: {total_chunks_created} chunks, {total_files_processed} files") - last_reported_batch = completed_batches - except Exception as e: - logger.error(f"Future raised exception: {e}") - - # Notify before ANN index finalization (happens when bulk_insert context exits) - if progress_callback: - progress_callback(f"Finalizing index... 
Building ANN index for {total_chunks_created} chunks") - - # --- SPLADE SPARSE ENCODING (after dense embeddings) --- - # Add SPLADE encoding if enabled in config - splade_success = False - splade_error = None - - try: - from codexlens.config import Config, SPLADE_DB_NAME - config = Config.load() - - if config.enable_splade: - from codexlens.semantic.splade_encoder import check_splade_available, get_splade_encoder - from codexlens.storage.splade_index import SpladeIndex - - ok, err = check_splade_available() - if ok: - if progress_callback: - progress_callback(f"Generating SPLADE sparse vectors for {total_chunks_created} chunks...") - - # Initialize SPLADE encoder and index - splade_encoder = get_splade_encoder(use_gpu=use_gpu) - # Use centralized SPLADE database if provided, otherwise fallback to index_path - effective_splade_path = splade_db_path if splade_db_path else index_path - splade_index = SpladeIndex(effective_splade_path) - splade_index.create_tables() - - # Retrieve all chunks from database for SPLADE encoding - with sqlite3.connect(index_path) as conn: - conn.row_factory = sqlite3.Row - cursor = conn.execute("SELECT id, content FROM semantic_chunks ORDER BY id") - - # Batch encode for efficiency - SPLADE_BATCH_SIZE = 32 - batch_postings = [] - chunk_batch = [] - chunk_ids = [] - - for row in cursor: - chunk_id = row["id"] - content = row["content"] - - chunk_ids.append(chunk_id) - chunk_batch.append(content) - - # Process batch when full - if len(chunk_batch) >= SPLADE_BATCH_SIZE: - sparse_vecs = splade_encoder.encode_batch(chunk_batch, batch_size=SPLADE_BATCH_SIZE) - for cid, sparse_vec in zip(chunk_ids, sparse_vecs): - batch_postings.append((cid, sparse_vec)) - - chunk_batch = [] - chunk_ids = [] - - # Process remaining chunks - if chunk_batch: - sparse_vecs = splade_encoder.encode_batch(chunk_batch, batch_size=SPLADE_BATCH_SIZE) - for cid, sparse_vec in zip(chunk_ids, sparse_vecs): - batch_postings.append((cid, sparse_vec)) - - # Batch insert all 
postings - if batch_postings: - splade_index.add_postings_batch(batch_postings) - - # Set metadata - splade_index.set_metadata( - model_name=splade_encoder.model_name, - vocab_size=splade_encoder.vocab_size - ) - - splade_success = True - if progress_callback: - stats = splade_index.get_stats() - progress_callback( - f"SPLADE index created: {stats['total_postings']} postings, " - f"{stats['unique_tokens']} unique tokens" - ) - else: - logger.debug("SPLADE not available: %s", err) - splade_error = f"SPLADE not available: {err}" - except Exception as e: - splade_error = str(e) - logger.warning("SPLADE encoding failed: %s", e) - - # Report SPLADE status after processing - if progress_callback and not splade_success and splade_error: - progress_callback(f"SPLADE index: FAILED - {splade_error}") - - except Exception as e: - # Cleanup on error to prevent process hanging - try: - _cleanup_fastembed_resources() - _cleanup_splade_resources() - gc.collect() - except Exception: - pass - return {"success": False, "error": f"Failed to read or process files: {str(e)}"} - - elapsed_time = time.time() - start_time - - # Final cleanup: release ONNX resources to allow process exit - # This is critical - without it, ONNX Runtime threads prevent Python from exiting - try: - _cleanup_fastembed_resources() - _cleanup_splade_resources() - gc.collect() - except Exception: - pass - - return { - "success": True, - "result": { - "chunks_created": total_chunks_created, - "files_processed": total_files_processed, - "files_failed": len(failed_files), - "elapsed_time": elapsed_time, - "model_profile": model_profile, - "model_name": embedder.model_name, - "failed_files": failed_files[:5], # First 5 failures - "index_path": str(index_path), - }, - } - - -def _discover_index_dbs_internal(index_root: Path) -> List[Path]: - """Internal helper to find all _index.db files (no deprecation warning). - - Used internally by generate_dense_embeddings_centralized. 
- - Args: - index_root: Root directory to scan for _index.db files - - Returns: - Sorted list of paths to _index.db files - """ - if not index_root.exists(): - return [] - - return sorted(index_root.rglob("_index.db")) - - -def discover_all_index_dbs(index_root: Path) -> List[Path]: - """Recursively find all _index.db files in an index tree. - - .. deprecated:: - This function is deprecated. Use centralized indexing with - ``generate_dense_embeddings_centralized`` instead, which handles - index discovery internally. - - Args: - index_root: Root directory to scan for _index.db files - - Returns: - Sorted list of paths to _index.db files - """ - import warnings - warnings.warn( - "discover_all_index_dbs is deprecated. Use centralized indexing with " - "generate_dense_embeddings_centralized instead.", - DeprecationWarning, - stacklevel=2 - ) - return _discover_index_dbs_internal(index_root) - - -def find_all_indexes(scan_dir: Path) -> List[Path]: - """Find all _index.db files in directory tree. - - Args: - scan_dir: Directory to scan - - Returns: - List of paths to _index.db files - """ - if not scan_dir.exists(): - return [] - - return list(scan_dir.rglob("_index.db")) - - - -def generate_embeddings_recursive( - index_root: Path, - embedding_backend: Optional[str] = None, - model_profile: Optional[str] = None, - force: bool = False, - chunk_size: int = 2000, - overlap: int = 200, - progress_callback: Optional[callable] = None, - use_gpu: Optional[bool] = None, - max_tokens_per_batch: Optional[int] = None, - max_workers: Optional[int] = None, - endpoints: Optional[List] = None, - strategy: Optional[str] = None, - cooldown: Optional[float] = None, -) -> Dict[str, any]: - """Generate embeddings for all index databases in a project recursively. - - .. deprecated:: - This function is deprecated. Use ``generate_dense_embeddings_centralized`` - instead, which creates a single centralized vector index for the entire project - rather than per-directory indexes. 
- - Args: - index_root: Root index directory containing _index.db files - embedding_backend: Embedding backend to use (fastembed or litellm). - Defaults to config setting. - model_profile: Model profile for fastembed (fast, code, multilingual, balanced) - or model name for litellm (e.g., qwen3-embedding). - Defaults to config setting. - force: If True, regenerate even if embeddings exist - chunk_size: Maximum chunk size in characters - overlap: Overlap size in characters for sliding window chunking (default: 200) - progress_callback: Optional callback for progress updates - use_gpu: Whether to use GPU acceleration (fastembed only). - Defaults to config setting. - max_tokens_per_batch: Maximum tokens per batch for token-aware batching. - If None, attempts to get from embedder.max_tokens, - then falls back to 8000. If set, overrides automatic detection. - max_workers: Maximum number of concurrent API calls. - If None, uses dynamic defaults based on backend and endpoint count. - endpoints: Optional list of endpoint configurations for multi-API load balancing. - strategy: Selection strategy for multi-endpoint mode. - cooldown: Default cooldown seconds for rate-limited endpoints. - - Returns: - Aggregated result dictionary with generation statistics - """ - import warnings - warnings.warn( - "generate_embeddings_recursive is deprecated. 
Use " - "generate_dense_embeddings_centralized instead for centralized indexing.", - DeprecationWarning, - stacklevel=2 - ) - - # Get defaults from config if not specified - (default_backend, default_model, default_gpu, - default_endpoints, default_strategy, default_cooldown) = _get_embedding_defaults() - - if embedding_backend is None: - embedding_backend = default_backend - if model_profile is None: - model_profile = default_model - if use_gpu is None: - use_gpu = default_gpu - if endpoints is None: - endpoints = default_endpoints - if strategy is None: - strategy = default_strategy - if cooldown is None: - cooldown = default_cooldown - - # Calculate endpoint count for worker scaling - endpoint_count = len(endpoints) if endpoints else 1 - - # Set dynamic max_workers default based on backend type and endpoint count - if max_workers is None: - if embedding_backend == "litellm": - if endpoint_count > 1: - max_workers = endpoint_count * 2 # No cap, scale with endpoints - else: - max_workers = 4 - else: - max_workers = 1 - - # Discover all _index.db files (using internal helper to avoid double deprecation warning) - index_files = _discover_index_dbs_internal(index_root) - - if not index_files: - return { - "success": False, - "error": f"No index databases found in {index_root}", - } - - if progress_callback: - progress_callback(f"Found {len(index_files)} index databases to process") - - # Calculate centralized SPLADE database path - from codexlens.config import SPLADE_DB_NAME - splade_db_path = index_root / SPLADE_DB_NAME - - # Process each index database - all_results = [] - total_chunks = 0 - total_files_processed = 0 - total_files_failed = 0 - - for idx, index_path in enumerate(index_files, 1): - if progress_callback: - try: - rel_path = index_path.relative_to(index_root) - except ValueError: - rel_path = index_path - # Format: "Processing file X/Y: path" to match Node.js parseProgressLine - progress_callback(f"Processing file {idx}/{len(index_files)}: {rel_path}") 
- - result = generate_embeddings( - index_path, - embedding_backend=embedding_backend, - model_profile=model_profile, - force=force, - chunk_size=chunk_size, - overlap=overlap, - progress_callback=None, # Don't cascade callbacks - use_gpu=use_gpu, - max_tokens_per_batch=max_tokens_per_batch, - max_workers=max_workers, - endpoints=endpoints, - strategy=strategy, - cooldown=cooldown, - splade_db_path=splade_db_path, # Use centralized SPLADE storage - ) - - all_results.append({ - "path": str(index_path), - "success": result["success"], - "result": result.get("result"), - "error": result.get("error"), - }) - - if result["success"]: - data = result["result"] - total_chunks += data["chunks_created"] - total_files_processed += data["files_processed"] - total_files_failed += data["files_failed"] - - successful = sum(1 for r in all_results if r["success"]) - - # Final cleanup after processing all indexes - # Each generate_embeddings() call does its own cleanup, but do a final one to be safe - try: - _cleanup_fastembed_resources() - _cleanup_splade_resources() - gc.collect() - except Exception: - pass - - return { - "success": successful > 0, - "result": { - "indexes_processed": len(index_files), - "indexes_successful": successful, - "indexes_failed": len(index_files) - successful, - "total_chunks_created": total_chunks, - "total_files_processed": total_files_processed, - "total_files_failed": total_files_failed, - "model_profile": model_profile, - "details": all_results, - }, - } - - -def generate_dense_embeddings_centralized( - index_root: Path, - embedding_backend: Optional[str] = None, - model_profile: Optional[str] = None, - force: bool = False, - chunk_size: int = 2000, - overlap: int = 200, - progress_callback: Optional[callable] = None, - use_gpu: Optional[bool] = None, - max_tokens_per_batch: Optional[int] = None, - max_workers: Optional[int] = None, - endpoints: Optional[List] = None, - strategy: Optional[str] = None, - cooldown: Optional[float] = None, -) -> 
Dict[str, any]: - """Generate dense embeddings with centralized vector storage. - - This function creates a single HNSW index at the project root instead of - per-directory indexes. All chunks from all _index.db files are combined - into one central _vectors.hnsw file. - - Target architecture: - / - |-- _vectors.hnsw # Centralized dense vector ANN index - |-- _splade.db # Centralized sparse vector index - |-- src/ - |-- _index.db # No longer contains .hnsw file - - Args: - index_root: Root index directory containing _index.db files - embedding_backend: Embedding backend (fastembed or litellm) - model_profile: Model profile or name - force: If True, regenerate even if embeddings exist - chunk_size: Maximum chunk size in characters - overlap: Overlap size in characters - progress_callback: Optional callback for progress updates - use_gpu: Whether to use GPU acceleration - max_tokens_per_batch: Maximum tokens per batch - max_workers: Maximum concurrent workers - endpoints: Multi-endpoint configurations - strategy: Endpoint selection strategy - cooldown: Rate-limit cooldown seconds - - Returns: - Result dictionary with generation statistics - """ - from codexlens.config import VECTORS_HNSW_NAME, SPLADE_DB_NAME - - # Get defaults from config if not specified - (default_backend, default_model, default_gpu, - default_endpoints, default_strategy, default_cooldown) = _get_embedding_defaults() - - if embedding_backend is None: - embedding_backend = default_backend - if model_profile is None: - model_profile = default_model - if use_gpu is None: - use_gpu = default_gpu - if endpoints is None: - endpoints = default_endpoints - if strategy is None: - strategy = default_strategy - if cooldown is None: - cooldown = default_cooldown - - # Calculate endpoint count for worker scaling - endpoint_count = len(endpoints) if endpoints else 1 - - if max_workers is None: - if embedding_backend == "litellm": - if endpoint_count > 1: - max_workers = endpoint_count * 2 - else: - max_workers = 
4 - else: - max_workers = 1 - - backend_available, backend_error = is_embedding_backend_available(embedding_backend) - if not backend_available: - return {"success": False, "error": backend_error or "Embedding backend not available"} - - # Discover all _index.db files - index_files = _discover_index_dbs_internal(index_root) - - if not index_files: - return { - "success": False, - "error": f"No index databases found in {index_root}", - } - - if progress_callback: - progress_callback(f"Found {len(index_files)} index databases for centralized embedding") - - # Pre-calculate estimated chunk count for HNSW capacity - # This avoids expensive resize operations during indexing - estimated_total_files = 0 - for index_path in index_files: - try: - with sqlite3.connect(index_path) as conn: - cursor = conn.execute("SELECT COUNT(*) FROM files") - estimated_total_files += cursor.fetchone()[0] - except Exception: - pass - # Heuristic: ~15 chunks per file on average - estimated_chunks = max(100000, estimated_total_files * 15) - - if progress_callback: - progress_callback(f"Estimated {estimated_total_files} files, ~{estimated_chunks} chunks") - - # Check for existing centralized index - central_hnsw_path = index_root / VECTORS_HNSW_NAME - if central_hnsw_path.exists() and not force: - return { - "success": False, - "error": f"Centralized vector index already exists at {central_hnsw_path}. 
Use --force to regenerate.", - } - - # Initialize embedder - try: - from codexlens.semantic.factory import get_embedder as get_embedder_factory - from codexlens.semantic.chunker import Chunker, ChunkConfig - from codexlens.semantic.ann_index import ANNIndex - - if embedding_backend == "fastembed": - embedder = get_embedder_factory(backend="fastembed", profile=model_profile, use_gpu=use_gpu) - elif embedding_backend == "litellm": - embedder = get_embedder_factory( - backend="litellm", - model=model_profile, - endpoints=endpoints if endpoints else None, - strategy=strategy, - cooldown=cooldown, - ) - else: - return { - "success": False, - "error": f"Invalid embedding backend: {embedding_backend}", - } - - # Load chunk stripping config from settings - from codexlens.config import Config - chunk_cfg = Config.load() - chunker = Chunker(config=ChunkConfig( - max_chunk_size=chunk_size, - overlap=overlap, - skip_token_count=True, - strip_comments=getattr(chunk_cfg, 'chunk_strip_comments', True), - strip_docstrings=getattr(chunk_cfg, 'chunk_strip_docstrings', True), - )) - - if progress_callback: - if endpoint_count > 1: - progress_callback(f"Using {endpoint_count} API endpoints with {strategy} strategy") - progress_callback(f"Using model: {embedder.model_name} ({embedder.embedding_dim} dimensions)") - - # Calculate dynamic batch size based on model capacity - batch_config = chunk_cfg # Reuse already loaded config - effective_batch_size = calculate_dynamic_batch_size(batch_config, embedder) - - if progress_callback and batch_config.api_batch_size_dynamic: - progress_callback(f"Dynamic batch size: {effective_batch_size} (model max_tokens={getattr(embedder, 'max_tokens', 8192)})") - - except Exception as e: - return { - "success": False, - "error": f"Failed to initialize components: {str(e)}", - } - - # Create centralized ANN index with pre-calculated capacity - # Using estimated_chunks avoids expensive resize operations during indexing - central_ann_index = 
ANNIndex.create_central( - index_root=index_root, - dim=embedder.embedding_dim, - initial_capacity=estimated_chunks, - auto_save=False, - ) - - # Process all index databases - start_time = time.time() - failed_files = [] - total_chunks_created = 0 - total_files_processed = 0 - all_chunk_ids = [] - all_embeddings = [] - - # Track chunk ID to file_path mapping for metadata - chunk_id_to_info: Dict[int, Dict[str, Any]] = {} - next_chunk_id = 1 - # Track current index_path for source_index_db field - current_index_path: Optional[str] = None - - for idx, index_path in enumerate(index_files, 1): - if progress_callback: - try: - rel_path = index_path.relative_to(index_root) - except ValueError: - rel_path = index_path - progress_callback(f"Processing {idx}/{len(index_files)}: {rel_path}") - - # Track current index_path for source_index_db - current_index_path = str(index_path) - - try: - with sqlite3.connect(index_path) as conn: - conn.row_factory = sqlite3.Row - path_column = _get_path_column(conn) - - # Get files from this index - cursor = conn.execute(f"SELECT {path_column}, content, language FROM files") - file_rows = cursor.fetchall() - - for file_row in file_rows: - file_path = file_row[path_column] - content = file_row["content"] - language = file_row["language"] or "python" - - try: - chunks = chunker.chunk_sliding_window( - content, - file_path=file_path, - language=language - ) - - if not chunks: - continue - - total_files_processed += 1 - - # Generate embeddings for this file's chunks - batch_contents = [chunk.content for chunk in chunks] - embeddings_numpy = embedder.embed_to_numpy(batch_contents, batch_size=effective_batch_size) - - # Assign chunk IDs and store embeddings - for i, chunk in enumerate(chunks): - chunk_id = next_chunk_id - next_chunk_id += 1 - - all_chunk_ids.append(chunk_id) - all_embeddings.append(embeddings_numpy[i]) - - # Store metadata for later retrieval - chunk_id_to_info[chunk_id] = { - "file_path": file_path, - "content": chunk.content, 
- "metadata": chunk.metadata, - "category": get_file_category(file_path) or "code", - "source_index_db": current_index_path, - } - total_chunks_created += 1 - - except Exception as e: - logger.error(f"Failed to process {file_path}: {e}") - failed_files.append((file_path, str(e))) - - except Exception as e: - logger.error(f"Failed to read index {index_path}: {e}") - failed_files.append((str(index_path), str(e))) - - # Add all embeddings to centralized ANN index - if all_embeddings: - if progress_callback: - progress_callback(f"Building centralized ANN index with {len(all_embeddings)} vectors...") - - try: - import numpy as np - embeddings_matrix = np.vstack(all_embeddings) - central_ann_index.add_vectors(all_chunk_ids, embeddings_matrix) - central_ann_index.save() - - if progress_callback: - progress_callback(f"Saved centralized index to {central_hnsw_path}") - - except Exception as e: - return { - "success": False, - "error": f"Failed to build centralized ANN index: {str(e)}", - } - - # Store chunk metadata in a centralized metadata database - vectors_meta_path = index_root / VECTORS_META_DB_NAME - if chunk_id_to_info: - if progress_callback: - progress_callback(f"Storing {len(chunk_id_to_info)} chunk metadata records...") - - try: - from codexlens.storage.vector_meta_store import VectorMetadataStore - - with VectorMetadataStore(vectors_meta_path) as meta_store: - # Convert chunk_id_to_info dict to list of dicts for batch insert - chunks_to_store = [] - for cid, info in chunk_id_to_info.items(): - metadata = info.get("metadata", {}) - chunks_to_store.append({ - "chunk_id": cid, - "file_path": info["file_path"], - "content": info["content"], - "start_line": metadata.get("start_line"), - "end_line": metadata.get("end_line"), - "category": info.get("category"), - "metadata": metadata, - "source_index_db": info.get("source_index_db"), - }) - - meta_store.add_chunks(chunks_to_store) - - if progress_callback: - progress_callback(f"Saved metadata to {vectors_meta_path}") 
- - except Exception as e: - logger.warning("Failed to store vector metadata: %s", e) - # Non-fatal: continue without centralized metadata - - # --- Binary Vector Generation for Cascade Search (Memory-Mapped) --- - binary_success = False - binary_count = 0 - try: - from codexlens.config import Config, BINARY_VECTORS_MMAP_NAME - config = Config.load() - - if getattr(config, 'enable_binary_cascade', True) and all_embeddings: - import numpy as np - - if progress_callback: - progress_callback(f"Generating binary vectors for {len(all_embeddings)} chunks...") - - # Binarize dense vectors: sign(x) -> 1 if x > 0, 0 otherwise - # Pack into bytes for efficient storage and Hamming distance computation - embeddings_matrix = np.vstack(all_embeddings) - binary_matrix = (embeddings_matrix > 0).astype(np.uint8) - - # Pack bits into bytes (8 bits per byte) - vectorized for all rows - packed_matrix = np.packbits(binary_matrix, axis=1) - binary_count = len(packed_matrix) - - # Save as memory-mapped file for efficient loading - binary_mmap_path = index_root / BINARY_VECTORS_MMAP_NAME - mmap_array = np.memmap( - str(binary_mmap_path), - dtype=np.uint8, - mode='w+', - shape=packed_matrix.shape - ) - mmap_array[:] = packed_matrix - mmap_array.flush() - del mmap_array # Close the memmap - - # Save metadata (shape and chunk_ids) to sidecar JSON - import json - meta_path = binary_mmap_path.with_suffix('.meta.json') - with open(meta_path, 'w') as f: - json.dump({ - 'shape': list(packed_matrix.shape), - 'chunk_ids': all_chunk_ids, - 'embedding_dim': embeddings_matrix.shape[1], - }, f) - - # Also store in DB for backward compatibility - from codexlens.storage.vector_meta_store import VectorMetadataStore - binary_packed_bytes = [row.tobytes() for row in packed_matrix] - with VectorMetadataStore(vectors_meta_path) as meta_store: - meta_store.add_binary_vectors(all_chunk_ids, binary_packed_bytes) - - binary_success = True - if progress_callback: - progress_callback(f"Generated {binary_count} 
binary vectors ({embeddings_matrix.shape[1]} dims -> {packed_matrix.shape[1]} bytes, mmap: {binary_mmap_path.name})") - - except Exception as e: - logger.warning("Binary vector generation failed: %s", e) - # Non-fatal: continue without binary vectors - - # --- SPLADE Sparse Index Generation (Centralized) --- - splade_success = False - splade_chunks_count = 0 - try: - from codexlens.config import Config - config = Config.load() - - if config.enable_splade and chunk_id_to_info: - from codexlens.semantic.splade_encoder import check_splade_available, get_splade_encoder - from codexlens.storage.splade_index import SpladeIndex - import json - - ok, err = check_splade_available() - if ok: - if progress_callback: - progress_callback(f"Generating SPLADE sparse vectors for {len(chunk_id_to_info)} chunks...") - - # Initialize SPLADE encoder and index - splade_encoder = get_splade_encoder(use_gpu=use_gpu) - splade_db_path = index_root / SPLADE_DB_NAME - splade_index = SpladeIndex(splade_db_path) - splade_index.create_tables() - - # Batch encode for efficiency - SPLADE_BATCH_SIZE = 32 - all_postings = [] - all_chunk_metadata = [] - - # Create batches from chunk_id_to_info - chunk_items = list(chunk_id_to_info.items()) - - for i in range(0, len(chunk_items), SPLADE_BATCH_SIZE): - batch_items = chunk_items[i:i + SPLADE_BATCH_SIZE] - chunk_ids = [item[0] for item in batch_items] - chunk_contents = [item[1]["content"] for item in batch_items] - - # Generate sparse vectors - sparse_vecs = splade_encoder.encode_batch(chunk_contents, batch_size=SPLADE_BATCH_SIZE) - for cid, sparse_vec in zip(chunk_ids, sparse_vecs): - all_postings.append((cid, sparse_vec)) - - if progress_callback and (i + SPLADE_BATCH_SIZE) % 100 == 0: - progress_callback(f"SPLADE encoding: {min(i + SPLADE_BATCH_SIZE, len(chunk_items))}/{len(chunk_items)}") - - # Batch insert all postings - if all_postings: - splade_index.add_postings_batch(all_postings) - - # CRITICAL FIX: Populate splade_chunks table - for cid, 
info in chunk_id_to_info.items(): - metadata_str = json.dumps(info.get("metadata", {})) if info.get("metadata") else None - all_chunk_metadata.append(( - cid, - info["file_path"], - info["content"], - metadata_str, - info.get("source_index_db") - )) - - if all_chunk_metadata: - splade_index.add_chunks_metadata_batch(all_chunk_metadata) - splade_chunks_count = len(all_chunk_metadata) - - # Set metadata - splade_index.set_metadata( - model_name=splade_encoder.model_name, - vocab_size=splade_encoder.vocab_size - ) - - splade_index.close() - splade_success = True - - if progress_callback: - progress_callback(f"SPLADE index created: {len(all_postings)} postings, {splade_chunks_count} chunks") - - else: - if progress_callback: - progress_callback(f"SPLADE not available, skipping sparse index: {err}") - - except Exception as e: - logger.warning("SPLADE encoding failed: %s", e) - if progress_callback: - progress_callback(f"SPLADE encoding failed: {e}") - - elapsed_time = time.time() - start_time - - # Cleanup - try: - _cleanup_fastembed_resources() - gc.collect() - except Exception: - pass - - return { - "success": True, - "result": { - "chunks_created": total_chunks_created, - "files_processed": total_files_processed, - "files_failed": len(failed_files), - "elapsed_time": elapsed_time, - "model_profile": model_profile, - "model_name": embedder.model_name, - "central_index_path": str(central_hnsw_path), - "failed_files": failed_files[:5], - "splade_success": splade_success, - "splade_chunks": splade_chunks_count, - "binary_success": binary_success, - "binary_count": binary_count, - }, - } - - -def get_embeddings_status(index_root: Path) -> Dict[str, any]: - """Get comprehensive embeddings coverage status for all indexes. 
- - Args: - index_root: Root index directory - - Returns: - Aggregated status with coverage statistics, model info, and timestamps - """ - index_files = _discover_index_dbs_internal(index_root) - - if not index_files: - return { - "success": True, - "result": { - "total_indexes": 0, - "total_files": 0, - "files_with_embeddings": 0, - "files_without_embeddings": 0, - "total_chunks": 0, - "coverage_percent": 0.0, - "indexes_with_embeddings": 0, - "indexes_without_embeddings": 0, - "model_info": None, - }, - } - - total_files = 0 - files_with_embeddings = 0 - total_chunks = 0 - indexes_with_embeddings = 0 - model_info = None - latest_updated_at = None - - for index_path in index_files: - status = check_index_embeddings(index_path) - if status["success"]: - result = status["result"] - total_files += result["total_files"] - files_with_embeddings += result["files_with_chunks"] - total_chunks += result["total_chunks"] - if result["has_embeddings"]: - indexes_with_embeddings += 1 - - # Get model config from first index with embeddings (they should all match) - if model_info is None: - try: - from codexlens.semantic.vector_store import VectorStore - with VectorStore(index_path) as vs: - config = vs.get_model_config() - if config: - model_info = { - "model_profile": config.get("model_profile"), - "model_name": config.get("model_name"), - "embedding_dim": config.get("embedding_dim"), - "backend": config.get("backend"), - "created_at": config.get("created_at"), - "updated_at": config.get("updated_at"), - } - latest_updated_at = config.get("updated_at") - except Exception: - pass - else: - # Track the latest updated_at across all indexes - try: - from codexlens.semantic.vector_store import VectorStore - with VectorStore(index_path) as vs: - config = vs.get_model_config() - if config and config.get("updated_at"): - if latest_updated_at is None or config["updated_at"] > latest_updated_at: - latest_updated_at = config["updated_at"] - except Exception: - pass - - # Update 
model_info with latest timestamp - if model_info and latest_updated_at: - model_info["updated_at"] = latest_updated_at - - return { - "success": True, - "result": { - "total_indexes": len(index_files), - "total_files": total_files, - "files_with_embeddings": files_with_embeddings, - "files_without_embeddings": total_files - files_with_embeddings, - "total_chunks": total_chunks, - "coverage_percent": round((files_with_embeddings / total_files * 100) if total_files > 0 else 0, 1), - "indexes_with_embeddings": indexes_with_embeddings, - "indexes_without_embeddings": len(index_files) - indexes_with_embeddings, - "model_info": model_info, - }, - } - - -def get_embedding_stats_summary(index_root: Path) -> Dict[str, any]: - """Get summary statistics for all indexes in root directory. - - Args: - index_root: Root directory containing indexes - - Returns: - Summary statistics for all indexes - """ - indexes = find_all_indexes(index_root) - - if not indexes: - return { - "success": True, - "result": { - "total_indexes": 0, - "indexes_with_embeddings": 0, - "total_chunks": 0, - "indexes": [], - }, - } - - total_chunks = 0 - indexes_with_embeddings = 0 - index_stats = [] - - for index_path in indexes: - status = check_index_embeddings(index_path) - - if status["success"]: - result = status["result"] - has_emb = result["has_embeddings"] - chunks = result["total_chunks"] - - if has_emb: - indexes_with_embeddings += 1 - total_chunks += chunks - - # Extract project name from path - project_name = index_path.parent.name - - index_stats.append({ - "project": project_name, - "path": str(index_path), - "has_embeddings": has_emb, - "total_chunks": chunks, - "total_files": result["total_files"], - "coverage_percent": result.get("coverage_percent", 0), - }) - - return { - "success": True, - "result": { - "total_indexes": len(indexes), - "indexes_with_embeddings": indexes_with_embeddings, - "total_chunks": total_chunks, - "indexes": index_stats, - }, - } - - -def scan_for_model_conflicts( 
- index_root: Path, - target_backend: str, - target_model: str, -) -> Dict[str, any]: - """Scan for model conflicts across all indexes in a directory. - - Checks if any existing embeddings were generated with a different - backend or model than the target configuration. - - Args: - index_root: Root index directory to scan - target_backend: Target embedding backend (fastembed or litellm) - target_model: Target model profile/name - - Returns: - Dictionary with: - - has_conflict: True if any index has different model config - - existing_config: Config from first index with embeddings (if any) - - target_config: The requested configuration - - conflicts: List of conflicting index paths with their configs - - indexes_with_embeddings: Count of indexes that have embeddings - """ - index_files = _discover_index_dbs_internal(index_root) - - if not index_files: - return { - "has_conflict": False, - "existing_config": None, - "target_config": {"backend": target_backend, "model": target_model}, - "conflicts": [], - "indexes_with_embeddings": 0, - } - - conflicts = [] - existing_config = None - indexes_with_embeddings = 0 - - for index_path in index_files: - try: - from codexlens.semantic.vector_store import VectorStore - - with VectorStore(index_path) as vs: - config = vs.get_model_config() - if config and config.get("model_profile"): - indexes_with_embeddings += 1 - - # Store first existing config as reference - if existing_config is None: - existing_config = { - "backend": config.get("backend"), - "model": config.get("model_profile"), - "model_name": config.get("model_name"), - "embedding_dim": config.get("embedding_dim"), - } - - # Check for conflict: different backend OR different model - existing_backend = config.get("backend", "") - existing_model = config.get("model_profile", "") - - if existing_backend != target_backend or existing_model != target_model: - conflicts.append({ - "path": str(index_path), - "existing": { - "backend": existing_backend, - "model": 
existing_model, - "model_name": config.get("model_name"), - }, - }) - except Exception as e: - logger.debug(f"Failed to check model config for {index_path}: {e}") - continue - - return { - "has_conflict": len(conflicts) > 0, - "existing_config": existing_config, - "target_config": {"backend": target_backend, "model": target_model}, - "conflicts": conflicts, - "indexes_with_embeddings": indexes_with_embeddings, - } - - -def _get_global_settings_path() -> Path: - """Get the path to global embedding settings file.""" - return Path.home() / ".codexlens" / "embedding_lock.json" - - -def get_locked_model_config() -> Optional[Dict[str, Any]]: - """Get the globally locked embedding model configuration. - - Returns: - Dictionary with backend and model if locked, None otherwise. - """ - settings_path = _get_global_settings_path() - if not settings_path.exists(): - return None - - try: - with open(settings_path, "r", encoding="utf-8") as f: - data = json.load(f) - if data.get("locked"): - return { - "backend": data.get("backend"), - "model": data.get("model"), - "locked_at": data.get("locked_at"), - } - except (json.JSONDecodeError, OSError): - pass - - return None - - -def set_locked_model_config(backend: str, model: str) -> None: - """Set the globally locked embedding model configuration. - - This is called after the first successful embedding generation - to lock the model for all future operations. - - Args: - backend: Embedding backend (fastembed or litellm) - model: Model profile/name - """ - import datetime - - settings_path = _get_global_settings_path() - settings_path.parent.mkdir(parents=True, exist_ok=True) - - data = { - "locked": True, - "backend": backend, - "model": model, - "locked_at": datetime.datetime.now().isoformat(), - } - - with open(settings_path, "w", encoding="utf-8") as f: - json.dump(data, f, indent=2) - - -def clear_locked_model_config() -> bool: - """Clear the globally locked embedding model configuration. 
- - Returns: - True if lock was cleared, False if no lock existed. - """ - settings_path = _get_global_settings_path() - if settings_path.exists(): - settings_path.unlink() - return True - return False - - -def check_global_model_lock( - target_backend: str, - target_model: str, -) -> Dict[str, Any]: - """Check if the target model conflicts with the global lock. - - Args: - target_backend: Requested embedding backend - target_model: Requested model profile/name - - Returns: - Dictionary with: - - is_locked: True if a global lock exists - - has_conflict: True if target differs from locked config - - locked_config: The locked configuration (if any) - - target_config: The requested configuration - """ - locked_config = get_locked_model_config() - - if locked_config is None: - return { - "is_locked": False, - "has_conflict": False, - "locked_config": None, - "target_config": {"backend": target_backend, "model": target_model}, - } - - has_conflict = ( - locked_config["backend"] != target_backend or - locked_config["model"] != target_model - ) - - return { - "is_locked": True, - "has_conflict": has_conflict, - "locked_config": locked_config, - "target_config": {"backend": target_backend, "model": target_model}, - } \ No newline at end of file diff --git a/codex-lens/build/lib/codexlens/cli/model_manager.py b/codex-lens/build/lib/codexlens/cli/model_manager.py deleted file mode 100644 index 15776cf1..00000000 --- a/codex-lens/build/lib/codexlens/cli/model_manager.py +++ /dev/null @@ -1,1026 +0,0 @@ -"""Model Manager - Manage fastembed models for semantic search.""" - -import json -import os -import shutil -from pathlib import Path -from typing import Dict, List, Optional - -try: - from huggingface_hub import snapshot_download, list_repo_files - HUGGINGFACE_HUB_AVAILABLE = True -except ImportError: - HUGGINGFACE_HUB_AVAILABLE = False - -try: - from fastembed import TextEmbedding - FASTEMBED_AVAILABLE = True -except ImportError: - FASTEMBED_AVAILABLE = False - -try: - # 
fastembed >= 0.4.0 moved TextCrossEncoder to rerank.cross_encoder - from fastembed.rerank.cross_encoder import TextCrossEncoder - RERANKER_AVAILABLE = True -except ImportError: - try: - # Fallback for older versions - from fastembed import TextCrossEncoder - RERANKER_AVAILABLE = True - except ImportError: - RERANKER_AVAILABLE = False - - -# Reranker model profiles with metadata -# Note: fastembed TextCrossEncoder uses ONNX models from HuggingFace -RERANKER_MODEL_PROFILES = { - "ms-marco-mini": { - "model_name": "Xenova/ms-marco-MiniLM-L-6-v2", - "cache_name": "Xenova/ms-marco-MiniLM-L-6-v2", - "size_mb": 90, - "description": "Fast, lightweight reranker (default)", - "use_case": "Quick prototyping, resource-constrained environments", - "recommended": True, - }, - "ms-marco-12": { - "model_name": "Xenova/ms-marco-MiniLM-L-12-v2", - "cache_name": "Xenova/ms-marco-MiniLM-L-12-v2", - "size_mb": 130, - "description": "Better quality, 12-layer MiniLM", - "use_case": "General purpose reranking with better accuracy", - "recommended": True, - }, - "bge-base": { - "model_name": "BAAI/bge-reranker-base", - "cache_name": "BAAI/bge-reranker-base", - "size_mb": 280, - "description": "BGE reranker base model", - "use_case": "High-quality reranking for production", - "recommended": True, - }, - "bge-large": { - "model_name": "BAAI/bge-reranker-large", - "cache_name": "BAAI/bge-reranker-large", - "size_mb": 560, - "description": "BGE reranker large model (high resource usage)", - "use_case": "Maximum quality reranking", - "recommended": False, - }, - "jina-tiny": { - "model_name": "jinaai/jina-reranker-v1-tiny-en", - "cache_name": "jinaai/jina-reranker-v1-tiny-en", - "size_mb": 70, - "description": "Jina tiny reranker, very fast", - "use_case": "Ultra-low latency applications", - "recommended": True, - }, - "jina-turbo": { - "model_name": "jinaai/jina-reranker-v1-turbo-en", - "cache_name": "jinaai/jina-reranker-v1-turbo-en", - "size_mb": 150, - "description": "Jina turbo reranker, 
balanced", - "use_case": "Fast reranking with good accuracy", - "recommended": True, - }, - # Additional reranker models (commonly used) - "bge-reranker-v2-m3": { - "model_name": "BAAI/bge-reranker-v2-m3", - "cache_name": "BAAI/bge-reranker-v2-m3", - "size_mb": 560, - "description": "BGE v2 M3 reranker, multilingual", - "use_case": "Multilingual reranking, latest BGE version", - "recommended": True, - }, - "bge-reranker-v2-gemma": { - "model_name": "BAAI/bge-reranker-v2-gemma", - "cache_name": "BAAI/bge-reranker-v2-gemma", - "size_mb": 2000, - "description": "BGE v2 Gemma reranker, best quality", - "use_case": "Maximum quality with Gemma backbone", - "recommended": False, - }, - "cross-encoder-ms-marco": { - "model_name": "cross-encoder/ms-marco-MiniLM-L-6-v2", - "cache_name": "cross-encoder/ms-marco-MiniLM-L-6-v2", - "size_mb": 90, - "description": "Original cross-encoder MS MARCO", - "use_case": "Classic cross-encoder baseline", - "recommended": False, - }, -} - - -# Model profiles with metadata -# Note: 768d is max recommended dimension for optimal performance/quality balance -# 1024d models are available but not recommended due to higher resource usage -# cache_name: The actual Hugging Face repo name used by fastembed for ONNX caching -MODEL_PROFILES = { - "fast": { - "model_name": "BAAI/bge-small-en-v1.5", - "cache_name": "qdrant/bge-small-en-v1.5-onnx-q", # fastembed uses ONNX version - "dimensions": 384, - "size_mb": 80, - "description": "Fast, lightweight, English-optimized", - "use_case": "Quick prototyping, resource-constrained environments", - "recommended": True, - }, - "base": { - "model_name": "BAAI/bge-base-en-v1.5", - "cache_name": "qdrant/bge-base-en-v1.5-onnx-q", # fastembed uses ONNX version - "dimensions": 768, - "size_mb": 220, - "description": "General purpose, good balance of speed and quality", - "use_case": "General text search, documentation", - "recommended": True, - }, - "code": { - "model_name": "jinaai/jina-embeddings-v2-base-code", - 
"cache_name": "jinaai/jina-embeddings-v2-base-code", # Uses original name - "dimensions": 768, - "size_mb": 150, - "description": "Code-optimized, best for programming languages", - "use_case": "Open source projects, code semantic search", - "recommended": True, - }, - "minilm": { - "model_name": "sentence-transformers/all-MiniLM-L6-v2", - "cache_name": "qdrant/all-MiniLM-L6-v2-onnx", # fastembed uses ONNX version - "dimensions": 384, - "size_mb": 90, - "description": "Popular lightweight model, good quality", - "use_case": "General purpose, low resource environments", - "recommended": True, - }, - "multilingual": { - "model_name": "intfloat/multilingual-e5-large", - "cache_name": "qdrant/multilingual-e5-large-onnx", # fastembed uses ONNX version - "dimensions": 1024, - "size_mb": 1000, - "description": "Multilingual + code support (high resource usage)", - "use_case": "Enterprise multilingual projects", - "recommended": False, # 1024d not recommended - }, - "balanced": { - "model_name": "mixedbread-ai/mxbai-embed-large-v1", - "cache_name": "mixedbread-ai/mxbai-embed-large-v1", # Uses original name - "dimensions": 1024, - "size_mb": 600, - "description": "High accuracy, general purpose (high resource usage)", - "use_case": "High-quality semantic search, balanced performance", - "recommended": False, # 1024d not recommended - }, - # Additional embedding models (commonly used) - "bge-large": { - "model_name": "BAAI/bge-large-en-v1.5", - "cache_name": "qdrant/bge-large-en-v1.5-onnx-q", - "dimensions": 1024, - "size_mb": 650, - "description": "BGE large model, highest quality", - "use_case": "Maximum quality semantic search", - "recommended": False, - }, - "e5-small": { - "model_name": "intfloat/e5-small-v2", - "cache_name": "qdrant/e5-small-v2-onnx", - "dimensions": 384, - "size_mb": 80, - "description": "E5 small model, fast and lightweight", - "use_case": "Low latency applications", - "recommended": True, - }, - "e5-base": { - "model_name": "intfloat/e5-base-v2", - 
"cache_name": "qdrant/e5-base-v2-onnx", - "dimensions": 768, - "size_mb": 220, - "description": "E5 base model, balanced", - "use_case": "General purpose semantic search", - "recommended": True, - }, - "e5-large": { - "model_name": "intfloat/e5-large-v2", - "cache_name": "qdrant/e5-large-v2-onnx", - "dimensions": 1024, - "size_mb": 650, - "description": "E5 large model, high quality", - "use_case": "High quality semantic search", - "recommended": False, - }, - "jina-base-en": { - "model_name": "jinaai/jina-embeddings-v2-base-en", - "cache_name": "jinaai/jina-embeddings-v2-base-en", - "dimensions": 768, - "size_mb": 150, - "description": "Jina base English model", - "use_case": "English text semantic search", - "recommended": True, - }, - "jina-small-en": { - "model_name": "jinaai/jina-embeddings-v2-small-en", - "cache_name": "jinaai/jina-embeddings-v2-small-en", - "dimensions": 512, - "size_mb": 60, - "description": "Jina small English model, very fast", - "use_case": "Low latency English text search", - "recommended": True, - }, - "snowflake-arctic": { - "model_name": "Snowflake/snowflake-arctic-embed-m", - "cache_name": "Snowflake/snowflake-arctic-embed-m", - "dimensions": 768, - "size_mb": 220, - "description": "Snowflake Arctic embedding model", - "use_case": "Enterprise semantic search, high quality", - "recommended": True, - }, - "nomic-embed": { - "model_name": "nomic-ai/nomic-embed-text-v1.5", - "cache_name": "nomic-ai/nomic-embed-text-v1.5", - "dimensions": 768, - "size_mb": 280, - "description": "Nomic embedding model, open source", - "use_case": "Open source text embedding", - "recommended": True, - }, - "gte-small": { - "model_name": "thenlper/gte-small", - "cache_name": "thenlper/gte-small", - "dimensions": 384, - "size_mb": 70, - "description": "GTE small model, fast", - "use_case": "Fast text embedding", - "recommended": True, - }, - "gte-base": { - "model_name": "thenlper/gte-base", - "cache_name": "thenlper/gte-base", - "dimensions": 768, - 
"size_mb": 220, - "description": "GTE base model, balanced", - "use_case": "General purpose text embedding", - "recommended": True, - }, - "gte-large": { - "model_name": "thenlper/gte-large", - "cache_name": "thenlper/gte-large", - "dimensions": 1024, - "size_mb": 650, - "description": "GTE large model, high quality", - "use_case": "High quality text embedding", - "recommended": False, - }, -} - - -def get_cache_dir() -> Path: - """Get fastembed cache directory. - - Returns: - Path to cache directory (~/.cache/huggingface or custom path) - """ - # Check HF_HOME environment variable first - if "HF_HOME" in os.environ: - return Path(os.environ["HF_HOME"]) - - # fastembed 0.7.4+ uses HuggingFace cache when cache_dir is specified - # Models are stored directly under the cache directory - return Path.home() / ".cache" / "huggingface" - - -def _get_model_cache_path(cache_dir: Path, info: Dict) -> Path: - """Get the actual cache path for a model. - - fastembed 0.7.4+ uses HuggingFace Hub's naming convention: - - Prefix: 'models--' - - Replace '/' with '--' in model name - Example: jinaai/jina-embeddings-v2-base-code - -> models--jinaai--jina-embeddings-v2-base-code - - Args: - cache_dir: The fastembed cache directory (HuggingFace hub path) - info: Model profile info dictionary - - Returns: - Path to the model cache directory - """ - # HuggingFace Hub naming: models--{org}--{model} - # Use cache_name if available (for mapped ONNX models), else model_name - target_name = info.get("cache_name", info["model_name"]) - sanitized_name = f"models--{target_name.replace('/', '--')}" - return cache_dir / sanitized_name - - -def scan_discovered_models(model_type: str = "embedding") -> List[Dict]: - """Scan cache directory for manually placed models not in predefined profiles. - - This allows users to manually download models (e.g., via huggingface-cli or - by copying the model directory) and have them recognized automatically. 
- - Args: - model_type: Type of models to scan for ("embedding" or "reranker") - - Returns: - List of discovered model info dictionaries - """ - cache_dir = get_cache_dir() - if not cache_dir.exists(): - return [] - - # Get known model cache names based on type - if model_type == "reranker": - known_cache_names = { - f"models--{info.get('cache_name', info['model_name']).replace('/', '--')}" - for info in RERANKER_MODEL_PROFILES.values() - } - else: - known_cache_names = { - f"models--{info.get('cache_name', info['model_name']).replace('/', '--')}" - for info in MODEL_PROFILES.values() - } - - discovered = [] - - # Scan for model directories in cache - for item in cache_dir.iterdir(): - if not item.is_dir() or not item.name.startswith("models--"): - continue - - # Skip known predefined models - if item.name in known_cache_names: - continue - - # Parse model name from directory (models--org--model -> org/model) - parts = item.name[8:].split("--") # Remove "models--" prefix - if len(parts) >= 2: - model_name = "/".join(parts) - else: - model_name = parts[0] if parts else item.name - - # Detect model type by checking for common patterns - is_reranker = any(keyword in model_name.lower() for keyword in [ - "reranker", "cross-encoder", "ms-marco" - ]) - is_embedding = any(keyword in model_name.lower() for keyword in [ - "embed", "bge", "e5", "jina", "minilm", "gte", "nomic", "arctic" - ]) - - # Filter based on requested type - if model_type == "reranker" and not is_reranker: - continue - if model_type == "embedding" and is_reranker: - continue - - # Calculate cache size - try: - total_size = sum( - f.stat().st_size - for f in item.rglob("*") - if f.is_file() - ) - cache_size_mb = round(total_size / (1024 * 1024), 1) - except (OSError, PermissionError): - cache_size_mb = 0 - - discovered.append({ - "profile": f"discovered:{model_name.replace('/', '-')}", - "model_name": model_name, - "cache_name": model_name, - "cache_path": str(item), - "actual_size_mb": cache_size_mb, - 
"description": f"Manually discovered model", - "use_case": "User-provided model", - "installed": True, - "source": "discovered", # Mark as discovered - }) - - return discovered - - -def list_models() -> Dict[str, any]: - """List available model profiles and their installation status. - - Returns: - Dictionary with model profiles, installed status, and cache info - """ - if not FASTEMBED_AVAILABLE: - return { - "success": False, - "error": "fastembed not installed. Install with: pip install codexlens[semantic]", - } - - cache_dir = get_cache_dir() - cache_exists = cache_dir.exists() - - models = [] - for profile, info in MODEL_PROFILES.items(): - model_name = info["model_name"] - - # Check if model is cached using the actual cache name - installed = False - cache_size_mb = 0 - - if cache_exists: - # Check for model directory in cache using correct cache_name - model_cache_path = _get_model_cache_path(cache_dir, info) - if model_cache_path.exists(): - installed = True - # Calculate cache size - total_size = sum( - f.stat().st_size - for f in model_cache_path.rglob("*") - if f.is_file() - ) - cache_size_mb = round(total_size / (1024 * 1024), 1) - - models.append({ - "profile": profile, - "model_name": model_name, - "dimensions": info["dimensions"], - "estimated_size_mb": info["size_mb"], - "actual_size_mb": cache_size_mb if installed else None, - "description": info["description"], - "use_case": info["use_case"], - "installed": installed, - "source": "predefined", # Mark as predefined - "recommended": info.get("recommended", True), - }) - - # Add discovered models (manually placed by user) - discovered = scan_discovered_models(model_type="embedding") - for model in discovered: - # Try to estimate dimensions based on common model patterns - dimensions = 768 # Default - name_lower = model["model_name"].lower() - if "small" in name_lower or "mini" in name_lower: - dimensions = 384 - elif "large" in name_lower: - dimensions = 1024 - - model["dimensions"] = dimensions - 
model["estimated_size_mb"] = model.get("actual_size_mb", 0) - model["recommended"] = False # User-provided models are not recommended by default - models.append(model) - - return { - "success": True, - "result": { - "models": models, - "cache_dir": str(cache_dir), - "cache_exists": cache_exists, - "manual_install_guide": { - "steps": [ - "1. Download: huggingface-cli download /", - "2. Or copy to cache directory (see paths below)", - "3. Refresh to see discovered models" - ], - "example": "huggingface-cli download BAAI/bge-small-en-v1.5", - "paths": { - "windows": "%USERPROFILE%\\.cache\\huggingface\\models----", - "linux": "~/.cache/huggingface/models----", - "macos": "~/.cache/huggingface/models----", - }, - }, - }, - } - - -def download_model(profile: str, progress_callback: Optional[callable] = None) -> Dict[str, any]: - """Download a model by profile name. - - Args: - profile: Model profile name (fast, code, multilingual, balanced) - progress_callback: Optional callback function to report progress - - Returns: - Result dictionary with success status - """ - if not FASTEMBED_AVAILABLE: - return { - "success": False, - "error": "fastembed not installed. Install with: pip install codexlens[semantic]", - } - - if profile not in MODEL_PROFILES: - return { - "success": False, - "error": f"Unknown profile: {profile}. 
Available: {', '.join(MODEL_PROFILES.keys())}", - } - - info = MODEL_PROFILES[profile] - model_name = info["model_name"] - - try: - # Get cache directory - cache_dir = get_cache_dir() - - # Download model by instantiating TextEmbedding with explicit cache_dir - # This ensures fastembed uses the correct HuggingFace Hub cache location - if progress_callback: - progress_callback(f"Downloading {model_name}...") - - # CRITICAL: Must specify cache_dir to use HuggingFace cache - # and call embed() to trigger actual download - embedder = TextEmbedding(model_name=model_name, cache_dir=str(cache_dir)) - - # Trigger actual download by calling embed - # TextEmbedding.__init__ alone doesn't download files - if progress_callback: - progress_callback(f"Initializing {model_name}...") - - list(embedder.embed(["test"])) # Trigger download - - if progress_callback: - progress_callback(f"Model {model_name} downloaded successfully") - - # Get cache info using correct HuggingFace Hub path - model_cache_path = _get_model_cache_path(cache_dir, info) - - cache_size = 0 - if model_cache_path.exists(): - total_size = sum( - f.stat().st_size - for f in model_cache_path.rglob("*") - if f.is_file() - ) - cache_size = round(total_size / (1024 * 1024), 1) - - return { - "success": True, - "result": { - "profile": profile, - "model_name": model_name, - "cache_size_mb": cache_size, - "cache_path": str(model_cache_path), - }, - } - - except Exception as e: - return { - "success": False, - "error": f"Failed to download model: {str(e)}", - } - - -def download_custom_model(model_name: str, model_type: str = "embedding", progress_callback: Optional[callable] = None) -> Dict[str, any]: - """Download a custom model by HuggingFace model name. - - This allows users to download any HuggingFace model directly from - HuggingFace Hub. The model will be placed in the standard cache - directory where it can be discovered by scan_discovered_models(). 
- - Note: Downloaded models may not be directly usable by FastEmbed unless - they are in ONNX format. This function is primarily for downloading - models that users want to use with other frameworks or custom code. - - Args: - model_name: Full HuggingFace model name (e.g., "intfloat/e5-small-v2") - model_type: Type of model ("embedding" or "reranker") - for metadata only - progress_callback: Optional callback function to report progress - - Returns: - Result dictionary with success status - """ - if not HUGGINGFACE_HUB_AVAILABLE: - return { - "success": False, - "error": "huggingface_hub not installed. Install with: pip install huggingface_hub", - } - - # Validate model name format (org/model-name) - if not model_name or "/" not in model_name: - return { - "success": False, - "error": "Invalid model name format. Expected: 'org/model-name' (e.g., 'intfloat/e5-small-v2')", - } - - try: - cache_dir = get_cache_dir() - - if progress_callback: - progress_callback(f"Checking model format for {model_name}...") - - # Check if model contains ONNX files before downloading - try: - files = list_repo_files(repo_id=model_name) - has_onnx = any( - f.endswith('.onnx') or - f.startswith('onnx/') or - '/onnx/' in f or - f == 'model.onnx' - for f in files - ) - - if not has_onnx: - return { - "success": False, - "error": f"Model '{model_name}' does not contain ONNX files. " - f"FastEmbed requires ONNX-format models. " - f"Try Xenova/* versions or check the recommended models list.", - "files_found": len(files), - "suggestion": "Use models from the 'Recommended Models' list, or search for ONNX versions (e.g., Xenova/*).", - } - - if progress_callback: - progress_callback(f"ONNX format detected. 
Downloading {model_name}...") - - except Exception as check_err: - # If we can't check, warn but allow download - if progress_callback: - progress_callback(f"Could not verify format, proceeding with download...") - - # Use huggingface_hub to download the model - # This downloads to the standard HuggingFace cache directory - local_path = snapshot_download( - repo_id=model_name, - cache_dir=str(cache_dir), - ) - - if progress_callback: - progress_callback(f"Model {model_name} downloaded successfully") - - # Get cache info - sanitized_name = f"models--{model_name.replace('/', '--')}" - model_cache_path = cache_dir / sanitized_name - - cache_size = 0 - if model_cache_path.exists(): - total_size = sum( - f.stat().st_size - for f in model_cache_path.rglob("*") - if f.is_file() - ) - cache_size = round(total_size / (1024 * 1024), 1) - - return { - "success": True, - "result": { - "model_name": model_name, - "model_type": model_type, - "cache_size_mb": cache_size, - "cache_path": str(model_cache_path), - "local_path": local_path, - "note": "Model downloaded. Note: Only ONNX-format models are compatible with FastEmbed.", - }, - } - - except Exception as e: - return { - "success": False, - "error": f"Failed to download custom model: {str(e)}", - } - - -def delete_model(profile: str) -> Dict[str, any]: - """Delete a downloaded model from cache. - - Args: - profile: Model profile name to delete - - Returns: - Result dictionary with success status - """ - if profile not in MODEL_PROFILES: - return { - "success": False, - "error": f"Unknown profile: {profile}. 
Available: {', '.join(MODEL_PROFILES.keys())}", - } - - info = MODEL_PROFILES[profile] - model_name = info["model_name"] - cache_dir = get_cache_dir() - model_cache_path = _get_model_cache_path(cache_dir, info) - - if not model_cache_path.exists(): - return { - "success": False, - "error": f"Model {profile} ({model_name}) is not installed", - } - - try: - # Calculate size before deletion - total_size = sum( - f.stat().st_size - for f in model_cache_path.rglob("*") - if f.is_file() - ) - size_mb = round(total_size / (1024 * 1024), 1) - - # Delete model directory - shutil.rmtree(model_cache_path) - - return { - "success": True, - "result": { - "profile": profile, - "model_name": model_name, - "deleted_size_mb": size_mb, - "cache_path": str(model_cache_path), - }, - } - - except Exception as e: - return { - "success": False, - "error": f"Failed to delete model: {str(e)}", - } - - -def get_model_info(profile: str) -> Dict[str, any]: - """Get detailed information about a model profile. - - Args: - profile: Model profile name - - Returns: - Result dictionary with model information - """ - if profile not in MODEL_PROFILES: - return { - "success": False, - "error": f"Unknown profile: {profile}. 
Available: {', '.join(MODEL_PROFILES.keys())}", - } - - info = MODEL_PROFILES[profile] - model_name = info["model_name"] - - # Check installation status using correct cache_name - cache_dir = get_cache_dir() - model_cache_path = _get_model_cache_path(cache_dir, info) - installed = model_cache_path.exists() - - cache_size_mb = None - if installed: - total_size = sum( - f.stat().st_size - for f in model_cache_path.rglob("*") - if f.is_file() - ) - cache_size_mb = round(total_size / (1024 * 1024), 1) - - return { - "success": True, - "result": { - "profile": profile, - "model_name": model_name, - "dimensions": info["dimensions"], - "estimated_size_mb": info["size_mb"], - "actual_size_mb": cache_size_mb, - "description": info["description"], - "use_case": info["use_case"], - "installed": installed, - "cache_path": str(model_cache_path) if installed else None, - }, - } - - -# ============================================================================ -# Reranker Model Management Functions -# ============================================================================ - - -def list_reranker_models() -> Dict[str, any]: - """List available reranker model profiles and their installation status. - - Returns: - Dictionary with reranker model profiles, installed status, and cache info - """ - if not RERANKER_AVAILABLE: - return { - "success": False, - "error": "fastembed reranker not available. 
Install with: pip install fastembed>=0.4.0", - } - - cache_dir = get_cache_dir() - cache_exists = cache_dir.exists() - - models = [] - for profile, info in RERANKER_MODEL_PROFILES.items(): - model_name = info["model_name"] - - # Check if model is cached - installed = False - cache_size_mb = 0 - - if cache_exists: - model_cache_path = _get_model_cache_path(cache_dir, info) - if model_cache_path.exists(): - installed = True - total_size = sum( - f.stat().st_size - for f in model_cache_path.rglob("*") - if f.is_file() - ) - cache_size_mb = round(total_size / (1024 * 1024), 1) - - models.append({ - "profile": profile, - "model_name": model_name, - "estimated_size_mb": info["size_mb"], - "actual_size_mb": cache_size_mb if installed else None, - "description": info["description"], - "use_case": info["use_case"], - "installed": installed, - "recommended": info.get("recommended", True), - "source": "predefined", # Mark as predefined - }) - - # Add discovered reranker models (manually placed by user) - discovered = scan_discovered_models(model_type="reranker") - for model in discovered: - model["estimated_size_mb"] = model.get("actual_size_mb", 0) - model["recommended"] = False # User-provided models are not recommended by default - models.append(model) - - return { - "success": True, - "result": { - "models": models, - "cache_dir": str(cache_dir), - "cache_exists": cache_exists, - "manual_install_guide": { - "steps": [ - "1. Download: huggingface-cli download /", - "2. Or copy to cache directory (see paths below)", - "3. Refresh to see discovered models", - ], - "example": "huggingface-cli download BAAI/bge-reranker-base", - "paths": { - "windows": "%USERPROFILE%\\.cache\\huggingface\\models----", - "linux": "~/.cache/huggingface/models----", - "macos": "~/.cache/huggingface/models----", - }, - }, - }, - } - - -def download_reranker_model(profile: str, progress_callback: Optional[callable] = None) -> Dict[str, any]: - """Download a reranker model by profile name. 
- - Args: - profile: Reranker model profile name - progress_callback: Optional callback function to report progress - - Returns: - Result dictionary with success status - """ - if not RERANKER_AVAILABLE: - return { - "success": False, - "error": "fastembed reranker not available. Install with: pip install fastembed>=0.4.0", - } - - if profile not in RERANKER_MODEL_PROFILES: - return { - "success": False, - "error": f"Unknown reranker profile: {profile}. Available: {', '.join(RERANKER_MODEL_PROFILES.keys())}", - } - - info = RERANKER_MODEL_PROFILES[profile] - model_name = info["model_name"] - - try: - cache_dir = get_cache_dir() - - if progress_callback: - progress_callback(f"Downloading reranker {model_name}...") - - # Download model by instantiating TextCrossEncoder with explicit cache_dir - reranker = TextCrossEncoder(model_name=model_name, cache_dir=str(cache_dir)) - - # Trigger actual download by calling rerank - if progress_callback: - progress_callback(f"Initializing {model_name}...") - - list(reranker.rerank("test query", ["test document"])) - - if progress_callback: - progress_callback(f"Reranker {model_name} downloaded successfully") - - # Get cache info - model_cache_path = _get_model_cache_path(cache_dir, info) - - cache_size = 0 - if model_cache_path.exists(): - total_size = sum( - f.stat().st_size - for f in model_cache_path.rglob("*") - if f.is_file() - ) - cache_size = round(total_size / (1024 * 1024), 1) - - return { - "success": True, - "result": { - "profile": profile, - "model_name": model_name, - "cache_size_mb": cache_size, - "cache_path": str(model_cache_path), - }, - } - - except Exception as e: - return { - "success": False, - "error": f"Failed to download reranker model: {str(e)}", - } - - -def delete_reranker_model(profile: str) -> Dict[str, any]: - """Delete a downloaded reranker model from cache. 
- - Args: - profile: Reranker model profile name to delete - - Returns: - Result dictionary with success status - """ - if profile not in RERANKER_MODEL_PROFILES: - return { - "success": False, - "error": f"Unknown reranker profile: {profile}. Available: {', '.join(RERANKER_MODEL_PROFILES.keys())}", - } - - info = RERANKER_MODEL_PROFILES[profile] - model_name = info["model_name"] - cache_dir = get_cache_dir() - model_cache_path = _get_model_cache_path(cache_dir, info) - - if not model_cache_path.exists(): - return { - "success": False, - "error": f"Reranker model {profile} ({model_name}) is not installed", - } - - try: - total_size = sum( - f.stat().st_size - for f in model_cache_path.rglob("*") - if f.is_file() - ) - size_mb = round(total_size / (1024 * 1024), 1) - - shutil.rmtree(model_cache_path) - - return { - "success": True, - "result": { - "profile": profile, - "model_name": model_name, - "deleted_size_mb": size_mb, - "cache_path": str(model_cache_path), - }, - } - - except Exception as e: - return { - "success": False, - "error": f"Failed to delete reranker model: {str(e)}", - } - - -def get_reranker_model_info(profile: str) -> Dict[str, any]: - """Get detailed information about a reranker model profile. - - Args: - profile: Reranker model profile name - - Returns: - Result dictionary with model information - """ - if profile not in RERANKER_MODEL_PROFILES: - return { - "success": False, - "error": f"Unknown reranker profile: {profile}. 
Available: {', '.join(RERANKER_MODEL_PROFILES.keys())}", - } - - info = RERANKER_MODEL_PROFILES[profile] - model_name = info["model_name"] - - cache_dir = get_cache_dir() - model_cache_path = _get_model_cache_path(cache_dir, info) - installed = model_cache_path.exists() - - cache_size_mb = None - if installed: - total_size = sum( - f.stat().st_size - for f in model_cache_path.rglob("*") - if f.is_file() - ) - cache_size_mb = round(total_size / (1024 * 1024), 1) - - return { - "success": True, - "result": { - "profile": profile, - "model_name": model_name, - "estimated_size_mb": info["size_mb"], - "actual_size_mb": cache_size_mb, - "description": info["description"], - "use_case": info["use_case"], - "installed": installed, - "recommended": info.get("recommended", True), - "cache_path": str(model_cache_path) if installed else None, - }, - } diff --git a/codex-lens/build/lib/codexlens/cli/output.py b/codex-lens/build/lib/codexlens/cli/output.py deleted file mode 100644 index 1abfb4d2..00000000 --- a/codex-lens/build/lib/codexlens/cli/output.py +++ /dev/null @@ -1,135 +0,0 @@ -"""Rich and JSON output helpers for CodexLens CLI.""" - -from __future__ import annotations - -import json -import sys -from dataclasses import asdict, is_dataclass -from pathlib import Path -from typing import Any, Iterable, Mapping, Sequence - -from rich.console import Console -from rich.table import Table -from rich.text import Text - -from codexlens.entities import SearchResult, Symbol - -# Force UTF-8 encoding for Windows console to properly display Chinese text -# Use force_terminal=True and legacy_windows=False to avoid GBK encoding issues -console = Console(force_terminal=True, legacy_windows=False) - - -def _to_jsonable(value: Any) -> Any: - if value is None: - return None - if hasattr(value, "model_dump"): - return value.model_dump() - if is_dataclass(value): - return asdict(value) - if isinstance(value, Path): - return str(value) - if isinstance(value, Mapping): - return {k: 
_to_jsonable(v) for k, v in value.items()} - if isinstance(value, (list, tuple, set)): - return [_to_jsonable(v) for v in value] - return value - - -def print_json(*, success: bool, result: Any = None, error: str | None = None, **kwargs: Any) -> None: - """Print JSON output with optional additional fields. - - Args: - success: Whether the operation succeeded - result: Result data (used when success=True) - error: Error message (used when success=False) - **kwargs: Additional fields to include in the payload (e.g., code, details) - """ - payload: dict[str, Any] = {"success": success} - if success: - payload["result"] = _to_jsonable(result) - else: - payload["error"] = error or "Unknown error" - # Include additional error details if provided - for key, value in kwargs.items(): - payload[key] = _to_jsonable(value) - console.print_json(json.dumps(payload, ensure_ascii=False)) - - -def render_search_results( - results: Sequence[SearchResult], *, title: str = "Search Results", verbose: bool = False -) -> None: - """Render search results with optional source tags in verbose mode. 
- - Args: - results: Search results to display - title: Table title - verbose: If True, show search source tags ([E], [F], [V]) and fusion scores - """ - table = Table(title=title, show_lines=False) - - if verbose: - # Verbose mode: show source tags - table.add_column("Source", style="dim", width=6, justify="center") - - table.add_column("Path", style="cyan", no_wrap=True) - table.add_column("Score", style="magenta", justify="right") - table.add_column("Excerpt", style="white") - - for res in results: - excerpt = res.excerpt or "" - score_str = f"{res.score:.3f}" - - if verbose: - # Extract search source tag if available - source = getattr(res, "search_source", None) - source_tag = "" - if source == "exact": - source_tag = "[E]" - elif source == "fuzzy": - source_tag = "[F]" - elif source == "vector": - source_tag = "[V]" - elif source == "fusion": - source_tag = "[RRF]" - table.add_row(source_tag, res.path, score_str, excerpt) - else: - table.add_row(res.path, score_str, excerpt) - - console.print(table) - - -def render_symbols(symbols: Sequence[Symbol], *, title: str = "Symbols") -> None: - table = Table(title=title) - table.add_column("Name", style="green") - table.add_column("Kind", style="yellow") - table.add_column("Range", style="white", justify="right") - - for sym in symbols: - start, end = sym.range - table.add_row(sym.name, sym.kind, f"{start}-{end}") - - console.print(table) - - -def render_status(stats: Mapping[str, Any]) -> None: - table = Table(title="Index Status") - table.add_column("Metric", style="cyan") - table.add_column("Value", style="white") - - for key, value in stats.items(): - if isinstance(value, Mapping): - value_text = ", ".join(f"{k}:{v}" for k, v in value.items()) - elif isinstance(value, (list, tuple)): - value_text = ", ".join(str(v) for v in value) - else: - value_text = str(value) - table.add_row(str(key), value_text) - - console.print(table) - - -def render_file_inspect(path: str, language: str, symbols: Iterable[Symbol]) -> 
None: - header = Text.assemble(("File: ", "bold"), (path, "cyan"), (" Language: ", "bold"), (language, "green")) - console.print(header) - render_symbols(list(symbols), title="Discovered Symbols") - diff --git a/codex-lens/build/lib/codexlens/config.py b/codex-lens/build/lib/codexlens/config.py deleted file mode 100644 index 238e922d..00000000 --- a/codex-lens/build/lib/codexlens/config.py +++ /dev/null @@ -1,692 +0,0 @@ -"""Configuration system for CodexLens.""" - -from __future__ import annotations - -import json -import logging -import os -from dataclasses import dataclass, field -from functools import cached_property -from pathlib import Path -from typing import Any, Dict, List, Optional - -from .errors import ConfigError - - -# Workspace-local directory name -WORKSPACE_DIR_NAME = ".codexlens" - -# Settings file name -SETTINGS_FILE_NAME = "settings.json" - -# SPLADE index database name (centralized storage) -SPLADE_DB_NAME = "_splade.db" - -# Dense vector storage names (centralized storage) -VECTORS_HNSW_NAME = "_vectors.hnsw" -VECTORS_META_DB_NAME = "_vectors_meta.db" -BINARY_VECTORS_MMAP_NAME = "_binary_vectors.mmap" - -log = logging.getLogger(__name__) - - -def _default_global_dir() -> Path: - """Get global CodexLens data directory.""" - env_override = os.getenv("CODEXLENS_DATA_DIR") - if env_override: - return Path(env_override).expanduser().resolve() - return (Path.home() / ".codexlens").resolve() - - -def find_workspace_root(start_path: Path) -> Optional[Path]: - """Find the workspace root by looking for .codexlens directory. - - Searches from start_path upward to find an existing .codexlens directory. - Returns None if not found. 
- """ - current = start_path.resolve() - - # Search up to filesystem root - while current != current.parent: - workspace_dir = current / WORKSPACE_DIR_NAME - if workspace_dir.is_dir(): - return current - current = current.parent - - # Check root as well - workspace_dir = current / WORKSPACE_DIR_NAME - if workspace_dir.is_dir(): - return current - - return None - - -@dataclass -class Config: - """Runtime configuration for CodexLens. - - - data_dir: Base directory for all persistent CodexLens data. - - venv_path: Optional virtualenv used for language tooling. - - supported_languages: Language IDs and their associated file extensions. - - parsing_rules: Per-language parsing and chunking hints. - """ - - data_dir: Path = field(default_factory=_default_global_dir) - venv_path: Path = field(default_factory=lambda: _default_global_dir() / "venv") - supported_languages: Dict[str, Dict[str, Any]] = field( - default_factory=lambda: { - # Source code languages (category: "code") - "python": {"extensions": [".py"], "tree_sitter_language": "python", "category": "code"}, - "javascript": {"extensions": [".js", ".jsx"], "tree_sitter_language": "javascript", "category": "code"}, - "typescript": {"extensions": [".ts", ".tsx"], "tree_sitter_language": "typescript", "category": "code"}, - "java": {"extensions": [".java"], "tree_sitter_language": "java", "category": "code"}, - "go": {"extensions": [".go"], "tree_sitter_language": "go", "category": "code"}, - "zig": {"extensions": [".zig"], "tree_sitter_language": "zig", "category": "code"}, - "objective-c": {"extensions": [".m", ".mm"], "tree_sitter_language": "objc", "category": "code"}, - "c": {"extensions": [".c", ".h"], "tree_sitter_language": "c", "category": "code"}, - "cpp": {"extensions": [".cc", ".cpp", ".hpp", ".cxx"], "tree_sitter_language": "cpp", "category": "code"}, - "rust": {"extensions": [".rs"], "tree_sitter_language": "rust", "category": "code"}, - } - ) - parsing_rules: Dict[str, Dict[str, Any]] = field( - 
default_factory=lambda: { - "default": { - "max_chunk_chars": 4000, - "max_chunk_lines": 200, - "overlap_lines": 20, - } - } - ) - - llm_enabled: bool = False - llm_tool: str = "gemini" - llm_timeout_ms: int = 300000 - llm_batch_size: int = 5 - - # Hybrid chunker configuration - hybrid_max_chunk_size: int = 2000 # Max characters per chunk before LLM refinement - hybrid_llm_refinement: bool = False # Enable LLM-based semantic boundary refinement - - # Embedding configuration - embedding_backend: str = "fastembed" # "fastembed" (local) or "litellm" (API) - embedding_model: str = "code" # For fastembed: profile (fast/code/multilingual/balanced) - # For litellm: model name from config (e.g., "qwen3-embedding") - embedding_use_gpu: bool = True # For fastembed: whether to use GPU acceleration - - # SPLADE sparse retrieval configuration - enable_splade: bool = False # Disable SPLADE by default (slow ~360ms, use FTS instead) - splade_model: str = "naver/splade-cocondenser-ensembledistil" - splade_threshold: float = 0.01 # Min weight to store in index - splade_onnx_path: Optional[str] = None # Custom ONNX model path - - # FTS fallback (disabled by default, available via --use-fts) - use_fts_fallback: bool = True # Use FTS for sparse search (fast, SPLADE disabled) - - # Indexing/search optimizations - global_symbol_index_enabled: bool = True # Enable project-wide symbol index fast path - enable_merkle_detection: bool = True # Enable content-hash based incremental indexing - - # Graph expansion (search-time, uses precomputed neighbors) - enable_graph_expansion: bool = False - graph_expansion_depth: int = 2 - - # Optional search reranking (disabled by default) - enable_reranking: bool = False - reranking_top_k: int = 50 - symbol_boost_factor: float = 1.5 - - # Optional cross-encoder reranking (second stage; requires optional reranker deps) - enable_cross_encoder_rerank: bool = False - reranker_backend: str = "onnx" - reranker_model: str = "cross-encoder/ms-marco-MiniLM-L-6-v2" 
- reranker_top_k: int = 50 - reranker_max_input_tokens: int = 8192 # Maximum tokens for reranker API batching - reranker_chunk_type_weights: Optional[Dict[str, float]] = None # Weights for chunk types: {"code": 1.0, "docstring": 0.7} - reranker_test_file_penalty: float = 0.0 # Penalty for test files (0.0-1.0, e.g., 0.2 = 20% reduction) - - # Chunk stripping configuration (for semantic embedding) - chunk_strip_comments: bool = True # Strip comments from code chunks - chunk_strip_docstrings: bool = True # Strip docstrings from code chunks - - # Cascade search configuration (two-stage retrieval) - enable_cascade_search: bool = False # Enable cascade search (coarse + fine ranking) - cascade_coarse_k: int = 100 # Number of coarse candidates from first stage - cascade_fine_k: int = 10 # Number of final results after reranking - cascade_strategy: str = "binary" # "binary" (fast binary+dense) or "hybrid" (FTS+SPLADE+Vector+CrossEncoder) - - # Staged cascade search configuration (4-stage pipeline) - staged_coarse_k: int = 200 # Number of coarse candidates from Stage 1 binary search - staged_lsp_depth: int = 2 # LSP relationship expansion depth in Stage 2 - staged_clustering_strategy: str = "auto" # "auto", "hdbscan", "dbscan", "frequency", "noop" - staged_clustering_min_size: int = 3 # Minimum cluster size for Stage 3 grouping - enable_staged_rerank: bool = True # Enable optional cross-encoder reranking in Stage 4 - - # RRF fusion configuration - fusion_method: str = "rrf" # "simple" (weighted sum) or "rrf" (reciprocal rank fusion) - rrf_k: int = 60 # RRF constant (default 60) - - # Category-based filtering to separate code/doc results - enable_category_filter: bool = True # Enable code/doc result separation - - # Multi-endpoint configuration for litellm backend - embedding_endpoints: List[Dict[str, Any]] = field(default_factory=list) - # List of endpoint configs: [{"model": "...", "api_key": "...", "api_base": "...", "weight": 1.0}] - embedding_pool_enabled: bool = False # 
Enable high availability pool for embeddings - embedding_strategy: str = "latency_aware" # round_robin, latency_aware, weighted_random - embedding_cooldown: float = 60.0 # Default cooldown seconds for rate-limited endpoints - - # Reranker multi-endpoint configuration - reranker_pool_enabled: bool = False # Enable high availability pool for reranker - reranker_strategy: str = "latency_aware" # round_robin, latency_aware, weighted_random - reranker_cooldown: float = 60.0 # Default cooldown seconds for rate-limited endpoints - - # API concurrency settings - api_max_workers: int = 4 # Max concurrent API calls for embedding/reranking - api_batch_size: int = 8 # Batch size for API requests - api_batch_size_dynamic: bool = False # Enable dynamic batch size calculation - api_batch_size_utilization_factor: float = 0.8 # Use 80% of model token capacity - api_batch_size_max: int = 2048 # Absolute upper limit for batch size - chars_per_token_estimate: int = 4 # Characters per token estimation ratio - - def __post_init__(self) -> None: - try: - self.data_dir = self.data_dir.expanduser().resolve() - self.venv_path = self.venv_path.expanduser().resolve() - self.data_dir.mkdir(parents=True, exist_ok=True) - except PermissionError as exc: - raise ConfigError( - f"Permission denied initializing paths (data_dir={self.data_dir}, venv_path={self.venv_path}) " - f"[{type(exc).__name__}]: {exc}" - ) from exc - except OSError as exc: - raise ConfigError( - f"Filesystem error initializing paths (data_dir={self.data_dir}, venv_path={self.venv_path}) " - f"[{type(exc).__name__}]: {exc}" - ) from exc - except Exception as exc: - raise ConfigError( - f"Unexpected error initializing paths (data_dir={self.data_dir}, venv_path={self.venv_path}) " - f"[{type(exc).__name__}]: {exc}" - ) from exc - - @cached_property - def cache_dir(self) -> Path: - """Directory for transient caches.""" - return self.data_dir / "cache" - - @cached_property - def index_dir(self) -> Path: - """Directory where index 
artifacts are stored.""" - return self.data_dir / "index" - - @cached_property - def db_path(self) -> Path: - """Default SQLite index path.""" - return self.index_dir / "codexlens.db" - - def ensure_runtime_dirs(self) -> None: - """Create standard runtime directories if missing.""" - for directory in (self.cache_dir, self.index_dir): - try: - directory.mkdir(parents=True, exist_ok=True) - except PermissionError as exc: - raise ConfigError( - f"Permission denied creating directory {directory} [{type(exc).__name__}]: {exc}" - ) from exc - except OSError as exc: - raise ConfigError( - f"Filesystem error creating directory {directory} [{type(exc).__name__}]: {exc}" - ) from exc - except Exception as exc: - raise ConfigError( - f"Unexpected error creating directory {directory} [{type(exc).__name__}]: {exc}" - ) from exc - - def language_for_path(self, path: str | Path) -> str | None: - """Infer a supported language ID from a file path.""" - extension = Path(path).suffix.lower() - for language_id, spec in self.supported_languages.items(): - extensions: List[str] = spec.get("extensions", []) - if extension in extensions: - return language_id - return None - - def category_for_path(self, path: str | Path) -> str | None: - """Get file category ('code' or 'doc') from a file path.""" - language = self.language_for_path(path) - if language is None: - return None - spec = self.supported_languages.get(language, {}) - return spec.get("category") - - def rules_for_language(self, language_id: str) -> Dict[str, Any]: - """Get parsing rules for a specific language, falling back to defaults.""" - return {**self.parsing_rules.get("default", {}), **self.parsing_rules.get(language_id, {})} - - @cached_property - def settings_path(self) -> Path: - """Path to the settings file.""" - return self.data_dir / SETTINGS_FILE_NAME - - def save_settings(self) -> None: - """Save embedding and other settings to file.""" - embedding_config = { - "backend": self.embedding_backend, - "model": 
self.embedding_model, - "use_gpu": self.embedding_use_gpu, - "pool_enabled": self.embedding_pool_enabled, - "strategy": self.embedding_strategy, - "cooldown": self.embedding_cooldown, - } - # Include multi-endpoint config if present - if self.embedding_endpoints: - embedding_config["endpoints"] = self.embedding_endpoints - - settings = { - "embedding": embedding_config, - "llm": { - "enabled": self.llm_enabled, - "tool": self.llm_tool, - "timeout_ms": self.llm_timeout_ms, - "batch_size": self.llm_batch_size, - }, - "reranker": { - "enabled": self.enable_cross_encoder_rerank, - "backend": self.reranker_backend, - "model": self.reranker_model, - "top_k": self.reranker_top_k, - "max_input_tokens": self.reranker_max_input_tokens, - "pool_enabled": self.reranker_pool_enabled, - "strategy": self.reranker_strategy, - "cooldown": self.reranker_cooldown, - }, - "cascade": { - "strategy": self.cascade_strategy, - "coarse_k": self.cascade_coarse_k, - "fine_k": self.cascade_fine_k, - }, - "api": { - "max_workers": self.api_max_workers, - "batch_size": self.api_batch_size, - "batch_size_dynamic": self.api_batch_size_dynamic, - "batch_size_utilization_factor": self.api_batch_size_utilization_factor, - "batch_size_max": self.api_batch_size_max, - "chars_per_token_estimate": self.chars_per_token_estimate, - }, - } - with open(self.settings_path, "w", encoding="utf-8") as f: - json.dump(settings, f, indent=2) - - def load_settings(self) -> None: - """Load settings from file if exists.""" - if not self.settings_path.exists(): - return - - try: - with open(self.settings_path, "r", encoding="utf-8") as f: - settings = json.load(f) - - # Load embedding settings - embedding = settings.get("embedding", {}) - if "backend" in embedding: - backend = embedding["backend"] - # Support 'api' as alias for 'litellm' - if backend == "api": - backend = "litellm" - if backend in {"fastembed", "litellm"}: - self.embedding_backend = backend - else: - log.warning( - "Invalid embedding backend in %s: %r 
(expected 'fastembed' or 'litellm')", - self.settings_path, - embedding["backend"], - ) - if "model" in embedding: - self.embedding_model = embedding["model"] - if "use_gpu" in embedding: - self.embedding_use_gpu = embedding["use_gpu"] - - # Load multi-endpoint configuration - if "endpoints" in embedding: - self.embedding_endpoints = embedding["endpoints"] - if "pool_enabled" in embedding: - self.embedding_pool_enabled = embedding["pool_enabled"] - if "strategy" in embedding: - self.embedding_strategy = embedding["strategy"] - if "cooldown" in embedding: - self.embedding_cooldown = embedding["cooldown"] - - # Load LLM settings - llm = settings.get("llm", {}) - if "enabled" in llm: - self.llm_enabled = llm["enabled"] - if "tool" in llm: - self.llm_tool = llm["tool"] - if "timeout_ms" in llm: - self.llm_timeout_ms = llm["timeout_ms"] - if "batch_size" in llm: - self.llm_batch_size = llm["batch_size"] - - # Load reranker settings - reranker = settings.get("reranker", {}) - if "enabled" in reranker: - self.enable_cross_encoder_rerank = reranker["enabled"] - if "backend" in reranker: - backend = reranker["backend"] - if backend in {"fastembed", "onnx", "api", "litellm", "legacy"}: - self.reranker_backend = backend - else: - log.warning( - "Invalid reranker backend in %s: %r (expected 'fastembed', 'onnx', 'api', 'litellm', or 'legacy')", - self.settings_path, - backend, - ) - if "model" in reranker: - self.reranker_model = reranker["model"] - if "top_k" in reranker: - self.reranker_top_k = reranker["top_k"] - if "max_input_tokens" in reranker: - self.reranker_max_input_tokens = reranker["max_input_tokens"] - if "pool_enabled" in reranker: - self.reranker_pool_enabled = reranker["pool_enabled"] - if "strategy" in reranker: - self.reranker_strategy = reranker["strategy"] - if "cooldown" in reranker: - self.reranker_cooldown = reranker["cooldown"] - - # Load cascade settings - cascade = settings.get("cascade", {}) - if "strategy" in cascade: - strategy = cascade["strategy"] 
- if strategy in {"binary", "hybrid", "binary_rerank", "dense_rerank"}: - self.cascade_strategy = strategy - else: - log.warning( - "Invalid cascade strategy in %s: %r (expected 'binary', 'hybrid', 'binary_rerank', or 'dense_rerank')", - self.settings_path, - strategy, - ) - if "coarse_k" in cascade: - self.cascade_coarse_k = cascade["coarse_k"] - if "fine_k" in cascade: - self.cascade_fine_k = cascade["fine_k"] - - # Load API settings - api = settings.get("api", {}) - if "max_workers" in api: - self.api_max_workers = api["max_workers"] - if "batch_size" in api: - self.api_batch_size = api["batch_size"] - if "batch_size_dynamic" in api: - self.api_batch_size_dynamic = api["batch_size_dynamic"] - if "batch_size_utilization_factor" in api: - self.api_batch_size_utilization_factor = api["batch_size_utilization_factor"] - if "batch_size_max" in api: - self.api_batch_size_max = api["batch_size_max"] - if "chars_per_token_estimate" in api: - self.chars_per_token_estimate = api["chars_per_token_estimate"] - except Exception as exc: - log.warning( - "Failed to load settings from %s (%s): %s", - self.settings_path, - type(exc).__name__, - exc, - ) - - # Apply .env overrides (highest priority) - self._apply_env_overrides() - - def _apply_env_overrides(self) -> None: - """Apply environment variable overrides from .env file. 
- - Priority: default → settings.json → .env (highest) - - Supported variables (with or without CODEXLENS_ prefix): - EMBEDDING_MODEL: Override embedding model/profile - EMBEDDING_BACKEND: Override embedding backend (fastembed/litellm) - EMBEDDING_POOL_ENABLED: Enable embedding high availability pool - EMBEDDING_STRATEGY: Load balance strategy for embedding - EMBEDDING_COOLDOWN: Rate limit cooldown for embedding - RERANKER_MODEL: Override reranker model - RERANKER_BACKEND: Override reranker backend - RERANKER_ENABLED: Override reranker enabled state (true/false) - RERANKER_POOL_ENABLED: Enable reranker high availability pool - RERANKER_STRATEGY: Load balance strategy for reranker - RERANKER_COOLDOWN: Rate limit cooldown for reranker - """ - from .env_config import load_global_env - - env_vars = load_global_env() - if not env_vars: - return - - def get_env(key: str) -> str | None: - """Get env var with or without CODEXLENS_ prefix.""" - # Check prefixed version first (Dashboard format), then unprefixed - return env_vars.get(f"CODEXLENS_{key}") or env_vars.get(key) - - # Embedding overrides - embedding_model = get_env("EMBEDDING_MODEL") - if embedding_model: - self.embedding_model = embedding_model - log.debug("Overriding embedding_model from .env: %s", self.embedding_model) - - embedding_backend = get_env("EMBEDDING_BACKEND") - if embedding_backend: - backend = embedding_backend.lower() - # Support 'api' as alias for 'litellm' - if backend == "api": - backend = "litellm" - if backend in {"fastembed", "litellm"}: - self.embedding_backend = backend - log.debug("Overriding embedding_backend from .env: %s", backend) - else: - log.warning("Invalid EMBEDDING_BACKEND in .env: %r", embedding_backend) - - embedding_pool = get_env("EMBEDDING_POOL_ENABLED") - if embedding_pool: - value = embedding_pool.lower() - self.embedding_pool_enabled = value in {"true", "1", "yes", "on"} - log.debug("Overriding embedding_pool_enabled from .env: %s", self.embedding_pool_enabled) - - 
embedding_strategy = get_env("EMBEDDING_STRATEGY") - if embedding_strategy: - strategy = embedding_strategy.lower() - if strategy in {"round_robin", "latency_aware", "weighted_random"}: - self.embedding_strategy = strategy - log.debug("Overriding embedding_strategy from .env: %s", strategy) - else: - log.warning("Invalid EMBEDDING_STRATEGY in .env: %r", embedding_strategy) - - embedding_cooldown = get_env("EMBEDDING_COOLDOWN") - if embedding_cooldown: - try: - self.embedding_cooldown = float(embedding_cooldown) - log.debug("Overriding embedding_cooldown from .env: %s", self.embedding_cooldown) - except ValueError: - log.warning("Invalid EMBEDDING_COOLDOWN in .env: %r", embedding_cooldown) - - # Reranker overrides - reranker_model = get_env("RERANKER_MODEL") - if reranker_model: - self.reranker_model = reranker_model - log.debug("Overriding reranker_model from .env: %s", self.reranker_model) - - reranker_backend = get_env("RERANKER_BACKEND") - if reranker_backend: - backend = reranker_backend.lower() - if backend in {"fastembed", "onnx", "api", "litellm", "legacy"}: - self.reranker_backend = backend - log.debug("Overriding reranker_backend from .env: %s", backend) - else: - log.warning("Invalid RERANKER_BACKEND in .env: %r", reranker_backend) - - reranker_enabled = get_env("RERANKER_ENABLED") - if reranker_enabled: - value = reranker_enabled.lower() - self.enable_cross_encoder_rerank = value in {"true", "1", "yes", "on"} - log.debug("Overriding reranker_enabled from .env: %s", self.enable_cross_encoder_rerank) - - reranker_pool = get_env("RERANKER_POOL_ENABLED") - if reranker_pool: - value = reranker_pool.lower() - self.reranker_pool_enabled = value in {"true", "1", "yes", "on"} - log.debug("Overriding reranker_pool_enabled from .env: %s", self.reranker_pool_enabled) - - reranker_strategy = get_env("RERANKER_STRATEGY") - if reranker_strategy: - strategy = reranker_strategy.lower() - if strategy in {"round_robin", "latency_aware", "weighted_random"}: - 
self.reranker_strategy = strategy - log.debug("Overriding reranker_strategy from .env: %s", strategy) - else: - log.warning("Invalid RERANKER_STRATEGY in .env: %r", reranker_strategy) - - reranker_cooldown = get_env("RERANKER_COOLDOWN") - if reranker_cooldown: - try: - self.reranker_cooldown = float(reranker_cooldown) - log.debug("Overriding reranker_cooldown from .env: %s", self.reranker_cooldown) - except ValueError: - log.warning("Invalid RERANKER_COOLDOWN in .env: %r", reranker_cooldown) - - reranker_max_tokens = get_env("RERANKER_MAX_INPUT_TOKENS") - if reranker_max_tokens: - try: - self.reranker_max_input_tokens = int(reranker_max_tokens) - log.debug("Overriding reranker_max_input_tokens from .env: %s", self.reranker_max_input_tokens) - except ValueError: - log.warning("Invalid RERANKER_MAX_INPUT_TOKENS in .env: %r", reranker_max_tokens) - - # Reranker tuning from environment - test_penalty = get_env("RERANKER_TEST_FILE_PENALTY") - if test_penalty: - try: - self.reranker_test_file_penalty = float(test_penalty) - log.debug("Overriding reranker_test_file_penalty from .env: %s", self.reranker_test_file_penalty) - except ValueError: - log.warning("Invalid RERANKER_TEST_FILE_PENALTY in .env: %r", test_penalty) - - docstring_weight = get_env("RERANKER_DOCSTRING_WEIGHT") - if docstring_weight: - try: - weight = float(docstring_weight) - self.reranker_chunk_type_weights = {"code": 1.0, "docstring": weight} - log.debug("Overriding reranker docstring weight from .env: %s", weight) - except ValueError: - log.warning("Invalid RERANKER_DOCSTRING_WEIGHT in .env: %r", docstring_weight) - - # Chunk stripping from environment - strip_comments = get_env("CHUNK_STRIP_COMMENTS") - if strip_comments: - self.chunk_strip_comments = strip_comments.lower() in ("true", "1", "yes") - log.debug("Overriding chunk_strip_comments from .env: %s", self.chunk_strip_comments) - - strip_docstrings = get_env("CHUNK_STRIP_DOCSTRINGS") - if strip_docstrings: - self.chunk_strip_docstrings = 
strip_docstrings.lower() in ("true", "1", "yes") - log.debug("Overriding chunk_strip_docstrings from .env: %s", self.chunk_strip_docstrings) - - @classmethod - def load(cls) -> "Config": - """Load config with settings from file.""" - config = cls() - config.load_settings() - return config - - -@dataclass -class WorkspaceConfig: - """Workspace-local configuration for CodexLens. - - Stores index data in project/.codexlens/ directory. - """ - - workspace_root: Path - - def __post_init__(self) -> None: - self.workspace_root = Path(self.workspace_root).resolve() - - @property - def codexlens_dir(self) -> Path: - """The .codexlens directory in workspace root.""" - return self.workspace_root / WORKSPACE_DIR_NAME - - @property - def db_path(self) -> Path: - """SQLite index path for this workspace.""" - return self.codexlens_dir / "index.db" - - @property - def cache_dir(self) -> Path: - """Cache directory for this workspace.""" - return self.codexlens_dir / "cache" - - @property - def env_path(self) -> Path: - """Path to workspace .env file.""" - return self.codexlens_dir / ".env" - - def load_env(self, *, override: bool = False) -> int: - """Load .env file and apply to os.environ. - - Args: - override: If True, override existing environment variables - - Returns: - Number of variables applied - """ - from .env_config import apply_workspace_env - return apply_workspace_env(self.workspace_root, override=override) - - def get_api_config(self, prefix: str) -> dict: - """Get API configuration from environment. - - Args: - prefix: Environment variable prefix (e.g., "RERANKER", "EMBEDDING") - - Returns: - Dictionary with api_key, api_base, model, etc. 
- """ - from .env_config import get_api_config - return get_api_config(prefix, workspace_root=self.workspace_root) - - def initialize(self) -> None: - """Create the .codexlens directory structure.""" - try: - self.codexlens_dir.mkdir(parents=True, exist_ok=True) - self.cache_dir.mkdir(parents=True, exist_ok=True) - - # Create .gitignore to exclude cache but keep index - gitignore_path = self.codexlens_dir / ".gitignore" - if not gitignore_path.exists(): - gitignore_path.write_text( - "# CodexLens workspace data\n" - "cache/\n" - "*.log\n" - ".env\n" # Exclude .env from git - ) - except Exception as exc: - raise ConfigError(f"Failed to initialize workspace at {self.codexlens_dir}: {exc}") from exc - - def exists(self) -> bool: - """Check if workspace is already initialized.""" - return self.codexlens_dir.is_dir() and self.db_path.exists() - - @classmethod - def from_path(cls, path: Path) -> Optional["WorkspaceConfig"]: - """Create WorkspaceConfig from a path by finding workspace root. - - Returns None if no workspace found. 
- """ - root = find_workspace_root(path) - if root is None: - return None - return cls(workspace_root=root) - - @classmethod - def create_at(cls, path: Path) -> "WorkspaceConfig": - """Create a new workspace at the given path.""" - config = cls(workspace_root=path) - config.initialize() - return config diff --git a/codex-lens/build/lib/codexlens/entities.py b/codex-lens/build/lib/codexlens/entities.py deleted file mode 100644 index d569cc3e..00000000 --- a/codex-lens/build/lib/codexlens/entities.py +++ /dev/null @@ -1,128 +0,0 @@ -"""Pydantic entity models for CodexLens.""" - -from __future__ import annotations - -import math -from enum import Enum -from typing import Any, Dict, List, Optional, Tuple - -from pydantic import BaseModel, Field, field_validator - - -class Symbol(BaseModel): - """A code symbol discovered in a file.""" - - name: str = Field(..., min_length=1) - kind: str = Field(..., min_length=1) - range: Tuple[int, int] = Field(..., description="(start_line, end_line), 1-based inclusive") - file: Optional[str] = Field(default=None, description="Full path to the file containing this symbol") - - @field_validator("range") - @classmethod - def validate_range(cls, value: Tuple[int, int]) -> Tuple[int, int]: - if len(value) != 2: - raise ValueError("range must be a (start_line, end_line) tuple") - start_line, end_line = value - if start_line < 1 or end_line < 1: - raise ValueError("range lines must be >= 1") - if end_line < start_line: - raise ValueError("end_line must be >= start_line") - return value - - -class SemanticChunk(BaseModel): - """A semantically meaningful chunk of content, optionally embedded.""" - - content: str = Field(..., min_length=1) - embedding: Optional[List[float]] = Field(default=None, description="Vector embedding for semantic search") - metadata: Dict[str, Any] = Field(default_factory=dict) - id: Optional[int] = Field(default=None, description="Database row ID") - file_path: Optional[str] = Field(default=None, description="Source 
file path") - - @field_validator("embedding") - @classmethod - def validate_embedding(cls, value: Optional[List[float]]) -> Optional[List[float]]: - if value is None: - return value - if not value: - raise ValueError("embedding cannot be empty when provided") - norm = math.sqrt(sum(x * x for x in value)) - epsilon = 1e-10 - if norm < epsilon: - raise ValueError("embedding cannot be a zero vector") - return value - - -class IndexedFile(BaseModel): - """An indexed source file with symbols and optional semantic chunks.""" - - path: str = Field(..., min_length=1) - language: str = Field(..., min_length=1) - symbols: List[Symbol] = Field(default_factory=list) - chunks: List[SemanticChunk] = Field(default_factory=list) - relationships: List["CodeRelationship"] = Field(default_factory=list) - - @field_validator("path", "language") - @classmethod - def strip_and_validate_nonempty(cls, value: str) -> str: - cleaned = value.strip() - if not cleaned: - raise ValueError("value cannot be blank") - return cleaned - - -class RelationshipType(str, Enum): - """Types of code relationships.""" - CALL = "calls" - INHERITS = "inherits" - IMPORTS = "imports" - - -class CodeRelationship(BaseModel): - """A relationship between code symbols (e.g., function calls, inheritance).""" - - source_symbol: str = Field(..., min_length=1, description="Name of source symbol") - target_symbol: str = Field(..., min_length=1, description="Name of target symbol") - relationship_type: RelationshipType = Field(..., description="Type of relationship (call, inherits, etc.)") - source_file: str = Field(..., min_length=1, description="File path containing source symbol") - target_file: Optional[str] = Field(default=None, description="File path containing target (None if same file)") - source_line: int = Field(..., ge=1, description="Line number where relationship occurs (1-based)") - - -class AdditionalLocation(BaseModel): - """A pointer to another location where a similar result was found. 
- - Used for grouping search results with similar scores and content, - where the primary result is stored in SearchResult and secondary - locations are stored in this model. - """ - - path: str = Field(..., min_length=1) - score: float = Field(..., ge=0.0) - start_line: Optional[int] = Field(default=None, description="Start line of the result (1-based)") - end_line: Optional[int] = Field(default=None, description="End line of the result (1-based)") - symbol_name: Optional[str] = Field(default=None, description="Name of matched symbol") - - -class SearchResult(BaseModel): - """A unified search result for lexical or semantic search.""" - - path: str = Field(..., min_length=1) - score: float = Field(..., ge=0.0) - excerpt: Optional[str] = None - content: Optional[str] = Field(default=None, description="Full content of matched code block") - symbol: Optional[Symbol] = None - chunk: Optional[SemanticChunk] = None - metadata: Dict[str, Any] = Field(default_factory=dict) - - # Additional context for complete code blocks - start_line: Optional[int] = Field(default=None, description="Start line of code block (1-based)") - end_line: Optional[int] = Field(default=None, description="End line of code block (1-based)") - symbol_name: Optional[str] = Field(default=None, description="Name of matched symbol/function/class") - symbol_kind: Optional[str] = Field(default=None, description="Kind of symbol (function/class/method)") - - # Field for grouping similar results - additional_locations: List["AdditionalLocation"] = Field( - default_factory=list, - description="Other locations for grouped results with similar scores and content." - ) diff --git a/codex-lens/build/lib/codexlens/env_config.py b/codex-lens/build/lib/codexlens/env_config.py deleted file mode 100644 index 8f27065d..00000000 --- a/codex-lens/build/lib/codexlens/env_config.py +++ /dev/null @@ -1,304 +0,0 @@ -"""Environment configuration loader for CodexLens. 
- -Loads .env files from workspace .codexlens directory with fallback to project root. -Provides unified access to API configurations. - -Priority order: -1. Environment variables (already set) -2. .codexlens/.env (workspace-local) -3. .env (project root) -""" - -from __future__ import annotations - -import logging -import os -from pathlib import Path -from typing import Any, Dict, Optional - -log = logging.getLogger(__name__) - -# Supported environment variables with descriptions -ENV_VARS = { - # Reranker configuration (overrides settings.json) - "RERANKER_MODEL": "Reranker model name (overrides settings.json)", - "RERANKER_BACKEND": "Reranker backend: fastembed, onnx, api, litellm, legacy", - "RERANKER_ENABLED": "Enable reranker: true/false", - "RERANKER_API_KEY": "API key for reranker service (SiliconFlow/Cohere/Jina)", - "RERANKER_API_BASE": "Base URL for reranker API (overrides provider default)", - "RERANKER_PROVIDER": "Reranker provider: siliconflow, cohere, jina", - "RERANKER_POOL_ENABLED": "Enable reranker high availability pool: true/false", - "RERANKER_STRATEGY": "Reranker load balance strategy: round_robin, latency_aware, weighted_random", - "RERANKER_COOLDOWN": "Reranker rate limit cooldown in seconds", - # Embedding configuration (overrides settings.json) - "EMBEDDING_MODEL": "Embedding model/profile name (overrides settings.json)", - "EMBEDDING_BACKEND": "Embedding backend: fastembed, litellm", - "EMBEDDING_API_KEY": "API key for embedding service", - "EMBEDDING_API_BASE": "Base URL for embedding API", - "EMBEDDING_POOL_ENABLED": "Enable embedding high availability pool: true/false", - "EMBEDDING_STRATEGY": "Embedding load balance strategy: round_robin, latency_aware, weighted_random", - "EMBEDDING_COOLDOWN": "Embedding rate limit cooldown in seconds", - # LiteLLM configuration - "LITELLM_API_KEY": "API key for LiteLLM", - "LITELLM_API_BASE": "Base URL for LiteLLM", - "LITELLM_MODEL": "LiteLLM model name", - # General configuration - 
"CODEXLENS_DATA_DIR": "Custom data directory path", - "CODEXLENS_DEBUG": "Enable debug mode (true/false)", - # Chunking configuration - "CHUNK_STRIP_COMMENTS": "Strip comments from code chunks for embedding: true/false (default: true)", - "CHUNK_STRIP_DOCSTRINGS": "Strip docstrings from code chunks for embedding: true/false (default: true)", - # Reranker tuning - "RERANKER_TEST_FILE_PENALTY": "Penalty for test files in reranking: 0.0-1.0 (default: 0.0)", - "RERANKER_DOCSTRING_WEIGHT": "Weight for docstring chunks in reranking: 0.0-1.0 (default: 1.0)", -} - - -def _parse_env_line(line: str) -> tuple[str, str] | None: - """Parse a single .env line, returning (key, value) or None.""" - line = line.strip() - - # Skip empty lines and comments - if not line or line.startswith("#"): - return None - - # Handle export prefix - if line.startswith("export "): - line = line[7:].strip() - - # Split on first = - if "=" not in line: - return None - - key, _, value = line.partition("=") - key = key.strip() - value = value.strip() - - # Remove surrounding quotes - if len(value) >= 2: - if (value.startswith('"') and value.endswith('"')) or \ - (value.startswith("'") and value.endswith("'")): - value = value[1:-1] - - return key, value - - -def load_env_file(env_path: Path) -> Dict[str, str]: - """Load environment variables from a .env file. 
- - Args: - env_path: Path to .env file - - Returns: - Dictionary of environment variables - """ - if not env_path.is_file(): - return {} - - env_vars: Dict[str, str] = {} - - try: - content = env_path.read_text(encoding="utf-8") - for line in content.splitlines(): - result = _parse_env_line(line) - if result: - key, value = result - env_vars[key] = value - except Exception as exc: - log.warning("Failed to load .env file %s: %s", env_path, exc) - - return env_vars - - -def _get_global_data_dir() -> Path: - """Get global CodexLens data directory.""" - env_override = os.environ.get("CODEXLENS_DATA_DIR") - if env_override: - return Path(env_override).expanduser().resolve() - return (Path.home() / ".codexlens").resolve() - - -def load_global_env() -> Dict[str, str]: - """Load environment variables from global ~/.codexlens/.env file. - - Returns: - Dictionary of environment variables from global config - """ - global_env_path = _get_global_data_dir() / ".env" - if global_env_path.is_file(): - env_vars = load_env_file(global_env_path) - log.debug("Loaded %d vars from global %s", len(env_vars), global_env_path) - return env_vars - return {} - - -def load_workspace_env(workspace_root: Path | None = None) -> Dict[str, str]: - """Load environment variables from workspace .env files. - - Priority (later overrides earlier): - 1. Global ~/.codexlens/.env (lowest priority) - 2. Project root .env - 3. .codexlens/.env (highest priority) - - Args: - workspace_root: Workspace root directory. If None, uses current directory. 
- - Returns: - Merged dictionary of environment variables - """ - if workspace_root is None: - workspace_root = Path.cwd() - - workspace_root = Path(workspace_root).resolve() - - env_vars: Dict[str, str] = {} - - # Load from global ~/.codexlens/.env (lowest priority) - global_vars = load_global_env() - if global_vars: - env_vars.update(global_vars) - - # Load from project root .env (medium priority) - root_env = workspace_root / ".env" - if root_env.is_file(): - loaded = load_env_file(root_env) - env_vars.update(loaded) - log.debug("Loaded %d vars from %s", len(loaded), root_env) - - # Load from .codexlens/.env (highest priority) - codexlens_env = workspace_root / ".codexlens" / ".env" - if codexlens_env.is_file(): - loaded = load_env_file(codexlens_env) - env_vars.update(loaded) - log.debug("Loaded %d vars from %s", len(loaded), codexlens_env) - - return env_vars - - -def apply_workspace_env(workspace_root: Path | None = None, *, override: bool = False) -> int: - """Load .env files and apply to os.environ. - - Args: - workspace_root: Workspace root directory - override: If True, override existing environment variables - - Returns: - Number of variables applied - """ - env_vars = load_workspace_env(workspace_root) - applied = 0 - - for key, value in env_vars.items(): - if override or key not in os.environ: - os.environ[key] = value - applied += 1 - log.debug("Applied env var: %s", key) - - return applied - - -def get_env(key: str, default: str | None = None, *, workspace_root: Path | None = None) -> str | None: - """Get environment variable with .env file fallback. - - Priority: - 1. os.environ (already set) - 2. .codexlens/.env - 3. .env - 4. 
default value - - Args: - key: Environment variable name - default: Default value if not found - workspace_root: Workspace root for .env file lookup - - Returns: - Value or default - """ - # Check os.environ first - if key in os.environ: - return os.environ[key] - - # Load from .env files - env_vars = load_workspace_env(workspace_root) - if key in env_vars: - return env_vars[key] - - return default - - -def get_api_config( - prefix: str, - *, - workspace_root: Path | None = None, - defaults: Dict[str, Any] | None = None, -) -> Dict[str, Any]: - """Get API configuration from environment. - - Loads {PREFIX}_API_KEY, {PREFIX}_API_BASE, {PREFIX}_MODEL, etc. - - Args: - prefix: Environment variable prefix (e.g., "RERANKER", "EMBEDDING") - workspace_root: Workspace root for .env file lookup - defaults: Default values - - Returns: - Dictionary with api_key, api_base, model, etc. - """ - defaults = defaults or {} - - config: Dict[str, Any] = {} - - # Standard API config fields - field_mapping = { - "api_key": f"{prefix}_API_KEY", - "api_base": f"{prefix}_API_BASE", - "model": f"{prefix}_MODEL", - "provider": f"{prefix}_PROVIDER", - "timeout": f"{prefix}_TIMEOUT", - } - - for field, env_key in field_mapping.items(): - value = get_env(env_key, workspace_root=workspace_root) - if value is not None: - # Type conversion for specific fields - if field == "timeout": - try: - config[field] = float(value) - except ValueError: - pass - else: - config[field] = value - elif field in defaults: - config[field] = defaults[field] - - return config - - -def generate_env_example() -> str: - """Generate .env.example content with all supported variables. 
- - Returns: - String content for .env.example file - """ - lines = [ - "# CodexLens Environment Configuration", - "# Copy this file to .codexlens/.env and fill in your values", - "", - ] - - # Group by prefix - groups: Dict[str, list] = {} - for key, desc in ENV_VARS.items(): - prefix = key.split("_")[0] - if prefix not in groups: - groups[prefix] = [] - groups[prefix].append((key, desc)) - - for prefix, items in groups.items(): - lines.append(f"# {prefix} Configuration") - for key, desc in items: - lines.append(f"# {desc}") - lines.append(f"# {key}=") - lines.append("") - - return "\n".join(lines) diff --git a/codex-lens/build/lib/codexlens/errors.py b/codex-lens/build/lib/codexlens/errors.py deleted file mode 100644 index cdaafa74..00000000 --- a/codex-lens/build/lib/codexlens/errors.py +++ /dev/null @@ -1,59 +0,0 @@ -"""CodexLens exception hierarchy.""" - -from __future__ import annotations - - -class CodexLensError(Exception): - """Base class for all CodexLens errors.""" - - -class ConfigError(CodexLensError): - """Raised when configuration is invalid or cannot be loaded.""" - - -class ParseError(CodexLensError): - """Raised when parsing or indexing a file fails.""" - - -class StorageError(CodexLensError): - """Raised when reading/writing index storage fails. 
- - Attributes: - message: Human-readable error description - db_path: Path to the database file (if applicable) - operation: The operation that failed (e.g., 'query', 'initialize', 'migrate') - details: Additional context for debugging - """ - - def __init__( - self, - message: str, - db_path: str | None = None, - operation: str | None = None, - details: dict | None = None - ) -> None: - super().__init__(message) - self.message = message - self.db_path = db_path - self.operation = operation - self.details = details or {} - - def __str__(self) -> str: - parts = [self.message] - if self.db_path: - parts.append(f"[db: {self.db_path}]") - if self.operation: - parts.append(f"[op: {self.operation}]") - if self.details: - detail_str = ", ".join(f"{k}={v}" for k, v in self.details.items()) - parts.append(f"[{detail_str}]") - return " ".join(parts) - - -class SearchError(CodexLensError): - """Raised when a search operation fails.""" - - -class IndexNotFoundError(CodexLensError): - """Raised when a project's index cannot be found.""" - diff --git a/codex-lens/build/lib/codexlens/hybrid_search/__init__.py b/codex-lens/build/lib/codexlens/hybrid_search/__init__.py deleted file mode 100644 index 03dd31b3..00000000 --- a/codex-lens/build/lib/codexlens/hybrid_search/__init__.py +++ /dev/null @@ -1,28 +0,0 @@ -"""Hybrid Search data structures for CodexLens. 
- -This module provides core data structures for hybrid search: -- CodeSymbolNode: Graph node representing a code symbol -- CodeAssociationGraph: Graph of code relationships -- SearchResultCluster: Clustered search results -- Range: Position range in source files -- CallHierarchyItem: LSP call hierarchy item - -Note: The search engine is in codexlens.search.hybrid_search - LSP-based expansion is in codexlens.lsp module -""" - -from codexlens.hybrid_search.data_structures import ( - CallHierarchyItem, - CodeAssociationGraph, - CodeSymbolNode, - Range, - SearchResultCluster, -) - -__all__ = [ - "CallHierarchyItem", - "CodeAssociationGraph", - "CodeSymbolNode", - "Range", - "SearchResultCluster", -] diff --git a/codex-lens/build/lib/codexlens/hybrid_search/data_structures.py b/codex-lens/build/lib/codexlens/hybrid_search/data_structures.py deleted file mode 100644 index 898971d0..00000000 --- a/codex-lens/build/lib/codexlens/hybrid_search/data_structures.py +++ /dev/null @@ -1,602 +0,0 @@ -"""Core data structures for the hybrid search system. - -This module defines the fundamental data structures used throughout the -hybrid search pipeline, including code symbol representations, association -graphs, and clustered search results. -""" - -from __future__ import annotations - -from dataclasses import dataclass, field -from typing import Any, Dict, List, Optional, Tuple, TYPE_CHECKING - -if TYPE_CHECKING: - import networkx as nx - - -@dataclass -class Range: - """Position range within a source file. - - Attributes: - start_line: Starting line number (0-based). - start_character: Starting character offset within the line. - end_line: Ending line number (0-based). - end_character: Ending character offset within the line. 
- """ - - start_line: int - start_character: int - end_line: int - end_character: int - - def __post_init__(self) -> None: - """Validate range values.""" - if self.start_line < 0: - raise ValueError("start_line must be >= 0") - if self.start_character < 0: - raise ValueError("start_character must be >= 0") - if self.end_line < 0: - raise ValueError("end_line must be >= 0") - if self.end_character < 0: - raise ValueError("end_character must be >= 0") - if self.end_line < self.start_line: - raise ValueError("end_line must be >= start_line") - if self.end_line == self.start_line and self.end_character < self.start_character: - raise ValueError("end_character must be >= start_character on the same line") - - def to_dict(self) -> Dict[str, Any]: - """Convert to dictionary for JSON serialization.""" - return { - "start": {"line": self.start_line, "character": self.start_character}, - "end": {"line": self.end_line, "character": self.end_character}, - } - - @classmethod - def from_dict(cls, data: Dict[str, Any]) -> Range: - """Create Range from dictionary representation.""" - return cls( - start_line=data["start"]["line"], - start_character=data["start"]["character"], - end_line=data["end"]["line"], - end_character=data["end"]["character"], - ) - - @classmethod - def from_lsp_range(cls, lsp_range: Dict[str, Any]) -> Range: - """Create Range from LSP Range object. - - LSP Range format: - {"start": {"line": int, "character": int}, - "end": {"line": int, "character": int}} - """ - return cls( - start_line=lsp_range["start"]["line"], - start_character=lsp_range["start"]["character"], - end_line=lsp_range["end"]["line"], - end_character=lsp_range["end"]["character"], - ) - - -@dataclass -class CallHierarchyItem: - """LSP CallHierarchyItem for representing callers/callees. - - Attributes: - name: Symbol name (function, method, class name). - kind: Symbol kind (function, method, class, etc.). - file_path: Absolute file path where the symbol is defined. 
- range: Position range in the source file. - detail: Optional additional detail about the symbol. - """ - - name: str - kind: str - file_path: str - range: Range - detail: Optional[str] = None - - def to_dict(self) -> Dict[str, Any]: - """Convert to dictionary for JSON serialization.""" - result: Dict[str, Any] = { - "name": self.name, - "kind": self.kind, - "file_path": self.file_path, - "range": self.range.to_dict(), - } - if self.detail: - result["detail"] = self.detail - return result - - @classmethod - def from_dict(cls, data: Dict[str, Any]) -> "CallHierarchyItem": - """Create CallHierarchyItem from dictionary representation.""" - return cls( - name=data["name"], - kind=data["kind"], - file_path=data["file_path"], - range=Range.from_dict(data["range"]), - detail=data.get("detail"), - ) - - -@dataclass -class CodeSymbolNode: - """Graph node representing a code symbol. - - Attributes: - id: Unique identifier in format 'file_path:name:line'. - name: Symbol name (function, class, variable name). - kind: Symbol kind (function, class, method, variable, etc.). - file_path: Absolute file path where symbol is defined. - range: Start/end position in the source file. - embedding: Optional vector embedding for semantic search. - raw_code: Raw source code of the symbol. - docstring: Documentation string (if available). - score: Ranking score (used during reranking). 
- """ - - id: str - name: str - kind: str - file_path: str - range: Range - embedding: Optional[List[float]] = None - raw_code: str = "" - docstring: str = "" - score: float = 0.0 - - def __post_init__(self) -> None: - """Validate required fields.""" - if not self.id: - raise ValueError("id cannot be empty") - if not self.name: - raise ValueError("name cannot be empty") - if not self.kind: - raise ValueError("kind cannot be empty") - if not self.file_path: - raise ValueError("file_path cannot be empty") - - def __hash__(self) -> int: - """Hash based on unique ID.""" - return hash(self.id) - - def __eq__(self, other: object) -> bool: - """Equality based on unique ID.""" - if not isinstance(other, CodeSymbolNode): - return False - return self.id == other.id - - def to_dict(self) -> Dict[str, Any]: - """Convert to dictionary for JSON serialization.""" - result: Dict[str, Any] = { - "id": self.id, - "name": self.name, - "kind": self.kind, - "file_path": self.file_path, - "range": self.range.to_dict(), - "score": self.score, - } - if self.raw_code: - result["raw_code"] = self.raw_code - if self.docstring: - result["docstring"] = self.docstring - # Exclude embedding from serialization (too large for JSON responses) - return result - - @classmethod - def from_dict(cls, data: Dict[str, Any]) -> CodeSymbolNode: - """Create CodeSymbolNode from dictionary representation.""" - return cls( - id=data["id"], - name=data["name"], - kind=data["kind"], - file_path=data["file_path"], - range=Range.from_dict(data["range"]), - embedding=data.get("embedding"), - raw_code=data.get("raw_code", ""), - docstring=data.get("docstring", ""), - score=data.get("score", 0.0), - ) - - @classmethod - def from_lsp_location( - cls, - uri: str, - name: str, - kind: str, - lsp_range: Dict[str, Any], - raw_code: str = "", - docstring: str = "", - ) -> CodeSymbolNode: - """Create CodeSymbolNode from LSP location data. - - Args: - uri: File URI (file:// prefix will be stripped). - name: Symbol name. 
- kind: Symbol kind. - lsp_range: LSP Range object. - raw_code: Optional raw source code. - docstring: Optional documentation string. - - Returns: - New CodeSymbolNode instance. - """ - # Strip file:// prefix if present - file_path = uri - if file_path.startswith("file://"): - file_path = file_path[7:] - # Handle Windows paths (file:///C:/...) - if len(file_path) > 2 and file_path[0] == "/" and file_path[2] == ":": - file_path = file_path[1:] - - range_obj = Range.from_lsp_range(lsp_range) - symbol_id = f"{file_path}:{name}:{range_obj.start_line}" - - return cls( - id=symbol_id, - name=name, - kind=kind, - file_path=file_path, - range=range_obj, - raw_code=raw_code, - docstring=docstring, - ) - - @classmethod - def create_id(cls, file_path: str, name: str, line: int) -> str: - """Generate a unique symbol ID. - - Args: - file_path: Absolute file path. - name: Symbol name. - line: Start line number. - - Returns: - Unique ID string in format 'file_path:name:line'. - """ - return f"{file_path}:{name}:{line}" - - -@dataclass -class CodeAssociationGraph: - """Graph of code relationships between symbols. - - This graph represents the association between code symbols discovered - through LSP queries (references, call hierarchy, etc.). - - Attributes: - nodes: Dictionary mapping symbol IDs to CodeSymbolNode objects. - edges: List of (from_id, to_id, relationship_type) tuples. - relationship_type: 'calls', 'references', 'inherits', 'imports'. - """ - - nodes: Dict[str, CodeSymbolNode] = field(default_factory=dict) - edges: List[Tuple[str, str, str]] = field(default_factory=list) - - def add_node(self, node: CodeSymbolNode) -> None: - """Add a node to the graph. - - Args: - node: CodeSymbolNode to add. If a node with the same ID exists, - it will be replaced. - """ - self.nodes[node.id] = node - - def add_edge(self, from_id: str, to_id: str, rel_type: str) -> None: - """Add an edge to the graph. - - Args: - from_id: Source node ID. - to_id: Target node ID. 
- rel_type: Relationship type ('calls', 'references', 'inherits', 'imports'). - - Raises: - ValueError: If from_id or to_id not in graph nodes. - """ - if from_id not in self.nodes: - raise ValueError(f"Source node '{from_id}' not found in graph") - if to_id not in self.nodes: - raise ValueError(f"Target node '{to_id}' not found in graph") - - edge = (from_id, to_id, rel_type) - if edge not in self.edges: - self.edges.append(edge) - - def add_edge_unchecked(self, from_id: str, to_id: str, rel_type: str) -> None: - """Add an edge without validating node existence. - - Use this method during bulk graph construction where nodes may be - added after edges, or when performance is critical. - - Args: - from_id: Source node ID. - to_id: Target node ID. - rel_type: Relationship type. - """ - edge = (from_id, to_id, rel_type) - if edge not in self.edges: - self.edges.append(edge) - - def get_node(self, node_id: str) -> Optional[CodeSymbolNode]: - """Get a node by ID. - - Args: - node_id: Node ID to look up. - - Returns: - CodeSymbolNode if found, None otherwise. - """ - return self.nodes.get(node_id) - - def get_neighbors(self, node_id: str, rel_type: Optional[str] = None) -> List[CodeSymbolNode]: - """Get neighboring nodes connected by outgoing edges. - - Args: - node_id: Node ID to find neighbors for. - rel_type: Optional filter by relationship type. - - Returns: - List of neighboring CodeSymbolNode objects. - """ - neighbors = [] - for from_id, to_id, edge_rel in self.edges: - if from_id == node_id: - if rel_type is None or edge_rel == rel_type: - node = self.nodes.get(to_id) - if node: - neighbors.append(node) - return neighbors - - def get_incoming(self, node_id: str, rel_type: Optional[str] = None) -> List[CodeSymbolNode]: - """Get nodes connected by incoming edges. - - Args: - node_id: Node ID to find incoming connections for. - rel_type: Optional filter by relationship type. - - Returns: - List of CodeSymbolNode objects with edges pointing to node_id. 
- """ - incoming = [] - for from_id, to_id, edge_rel in self.edges: - if to_id == node_id: - if rel_type is None or edge_rel == rel_type: - node = self.nodes.get(from_id) - if node: - incoming.append(node) - return incoming - - def to_networkx(self) -> "nx.DiGraph": - """Convert to NetworkX DiGraph for graph algorithms. - - Returns: - NetworkX directed graph with nodes and edges. - - Raises: - ImportError: If networkx is not installed. - """ - try: - import networkx as nx - except ImportError: - raise ImportError( - "networkx is required for graph algorithms. " - "Install with: pip install networkx" - ) - - graph = nx.DiGraph() - - # Add nodes with attributes - for node_id, node in self.nodes.items(): - graph.add_node( - node_id, - name=node.name, - kind=node.kind, - file_path=node.file_path, - score=node.score, - ) - - # Add edges with relationship type - for from_id, to_id, rel_type in self.edges: - graph.add_edge(from_id, to_id, relationship=rel_type) - - return graph - - def to_dict(self) -> Dict[str, Any]: - """Convert to dictionary for JSON serialization. - - Returns: - Dictionary with 'nodes' and 'edges' keys. - """ - return { - "nodes": {node_id: node.to_dict() for node_id, node in self.nodes.items()}, - "edges": [ - {"from": from_id, "to": to_id, "relationship": rel_type} - for from_id, to_id, rel_type in self.edges - ], - } - - @classmethod - def from_dict(cls, data: Dict[str, Any]) -> CodeAssociationGraph: - """Create CodeAssociationGraph from dictionary representation. - - Args: - data: Dictionary with 'nodes' and 'edges' keys. - - Returns: - New CodeAssociationGraph instance. 
- """ - graph = cls() - - # Load nodes - for node_id, node_data in data.get("nodes", {}).items(): - graph.nodes[node_id] = CodeSymbolNode.from_dict(node_data) - - # Load edges - for edge_data in data.get("edges", []): - graph.edges.append(( - edge_data["from"], - edge_data["to"], - edge_data["relationship"], - )) - - return graph - - def __len__(self) -> int: - """Return the number of nodes in the graph.""" - return len(self.nodes) - - -@dataclass -class SearchResultCluster: - """Clustered search result containing related code symbols. - - Search results are grouped into clusters based on graph community - detection or embedding similarity. Each cluster represents a - conceptually related group of code symbols. - - Attributes: - cluster_id: Unique cluster identifier. - score: Cluster relevance score (max of symbol scores). - title: Human-readable cluster title/summary. - symbols: List of CodeSymbolNode in this cluster. - metadata: Additional cluster metadata. - """ - - cluster_id: str - score: float - title: str - symbols: List[CodeSymbolNode] = field(default_factory=list) - metadata: Dict[str, Any] = field(default_factory=dict) - - def __post_init__(self) -> None: - """Validate cluster fields.""" - if not self.cluster_id: - raise ValueError("cluster_id cannot be empty") - if self.score < 0: - raise ValueError("score must be >= 0") - - def add_symbol(self, symbol: CodeSymbolNode) -> None: - """Add a symbol to the cluster. - - Args: - symbol: CodeSymbolNode to add. - """ - self.symbols.append(symbol) - - def get_top_symbols(self, n: int = 5) -> List[CodeSymbolNode]: - """Get top N symbols by score. - - Args: - n: Number of symbols to return. - - Returns: - List of top N CodeSymbolNode objects sorted by score descending. 
- """ - sorted_symbols = sorted(self.symbols, key=lambda s: s.score, reverse=True) - return sorted_symbols[:n] - - def update_score(self) -> None: - """Update cluster score to max of symbol scores.""" - if self.symbols: - self.score = max(s.score for s in self.symbols) - - def to_dict(self) -> Dict[str, Any]: - """Convert to dictionary for JSON serialization. - - Returns: - Dictionary representation of the cluster. - """ - return { - "cluster_id": self.cluster_id, - "score": self.score, - "title": self.title, - "symbols": [s.to_dict() for s in self.symbols], - "metadata": self.metadata, - } - - @classmethod - def from_dict(cls, data: Dict[str, Any]) -> SearchResultCluster: - """Create SearchResultCluster from dictionary representation. - - Args: - data: Dictionary with cluster data. - - Returns: - New SearchResultCluster instance. - """ - return cls( - cluster_id=data["cluster_id"], - score=data["score"], - title=data["title"], - symbols=[CodeSymbolNode.from_dict(s) for s in data.get("symbols", [])], - metadata=data.get("metadata", {}), - ) - - def __len__(self) -> int: - """Return the number of symbols in the cluster.""" - return len(self.symbols) - - -@dataclass -class CallHierarchyItem: - """LSP CallHierarchyItem for representing callers/callees. - - Attributes: - name: Symbol name (function, method, etc.). - kind: Symbol kind (function, method, etc.). - file_path: Absolute file path. - range: Position range in the file. - detail: Optional additional detail (e.g., signature). 
- """ - - name: str - kind: str - file_path: str - range: Range - detail: Optional[str] = None - - def to_dict(self) -> Dict[str, Any]: - """Convert to dictionary for JSON serialization.""" - result: Dict[str, Any] = { - "name": self.name, - "kind": self.kind, - "file_path": self.file_path, - "range": self.range.to_dict(), - } - if self.detail: - result["detail"] = self.detail - return result - - @classmethod - def from_dict(cls, data: Dict[str, Any]) -> "CallHierarchyItem": - """Create CallHierarchyItem from dictionary representation.""" - return cls( - name=data.get("name", "unknown"), - kind=data.get("kind", "unknown"), - file_path=data.get("file_path", data.get("uri", "")), - range=Range.from_dict(data.get("range", {"start": {"line": 0, "character": 0}, "end": {"line": 0, "character": 0}})), - detail=data.get("detail"), - ) - - @classmethod - def from_lsp(cls, data: Dict[str, Any]) -> "CallHierarchyItem": - """Create CallHierarchyItem from LSP response format. - - LSP uses 0-based line numbers and 'character' instead of 'char'. 
- """ - uri = data.get("uri", data.get("file_path", "")) - # Strip file:// prefix - file_path = uri - if file_path.startswith("file://"): - file_path = file_path[7:] - if len(file_path) > 2 and file_path[0] == "/" and file_path[2] == ":": - file_path = file_path[1:] - - return cls( - name=data.get("name", "unknown"), - kind=str(data.get("kind", "unknown")), - file_path=file_path, - range=Range.from_lsp_range(data.get("range", {"start": {"line": 0, "character": 0}, "end": {"line": 0, "character": 0}})), - detail=data.get("detail"), - ) diff --git a/codex-lens/build/lib/codexlens/indexing/__init__.py b/codex-lens/build/lib/codexlens/indexing/__init__.py deleted file mode 100644 index 1136099f..00000000 --- a/codex-lens/build/lib/codexlens/indexing/__init__.py +++ /dev/null @@ -1,26 +0,0 @@ -"""Code indexing and symbol extraction.""" -from codexlens.indexing.symbol_extractor import SymbolExtractor -from codexlens.indexing.embedding import ( - BinaryEmbeddingBackend, - DenseEmbeddingBackend, - CascadeEmbeddingBackend, - get_cascade_embedder, - binarize_embedding, - pack_binary_embedding, - unpack_binary_embedding, - hamming_distance, -) - -__all__ = [ - "SymbolExtractor", - # Cascade embedding backends - "BinaryEmbeddingBackend", - "DenseEmbeddingBackend", - "CascadeEmbeddingBackend", - "get_cascade_embedder", - # Utility functions - "binarize_embedding", - "pack_binary_embedding", - "unpack_binary_embedding", - "hamming_distance", -] diff --git a/codex-lens/build/lib/codexlens/indexing/embedding.py b/codex-lens/build/lib/codexlens/indexing/embedding.py deleted file mode 100644 index 4175f3e5..00000000 --- a/codex-lens/build/lib/codexlens/indexing/embedding.py +++ /dev/null @@ -1,582 +0,0 @@ -"""Multi-type embedding backends for cascade retrieval. - -This module provides embedding backends optimized for cascade retrieval: -1. BinaryEmbeddingBackend - Fast coarse filtering with binary vectors -2. DenseEmbeddingBackend - High-precision dense vectors for reranking -3. 
CascadeEmbeddingBackend - Combined binary + dense for two-stage retrieval - -Cascade retrieval workflow: -1. Binary search (fast, ~32 bytes/vector) -> top-K candidates -2. Dense rerank (precise, ~8KB/vector) -> final results -""" - -from __future__ import annotations - -import logging -from typing import Iterable, List, Optional, Tuple - -import numpy as np - -from codexlens.semantic.base import BaseEmbedder - -logger = logging.getLogger(__name__) - - -# ============================================================================= -# Utility Functions -# ============================================================================= - - -def binarize_embedding(embedding: np.ndarray) -> np.ndarray: - """Convert float embedding to binary vector. - - Applies sign-based quantization: values > 0 become 1, values <= 0 become 0. - - Args: - embedding: Float32 embedding of any dimension - - Returns: - Binary vector (uint8 with values 0 or 1) of same dimension - """ - return (embedding > 0).astype(np.uint8) - - -def pack_binary_embedding(binary_vector: np.ndarray) -> bytes: - """Pack binary vector into compact bytes format. - - Packs 8 binary values into each byte for storage efficiency. - For a 256-dim binary vector, output is 32 bytes. - - Args: - binary_vector: Binary vector (uint8 with values 0 or 1) - - Returns: - Packed bytes (length = ceil(dim / 8)) - """ - # Ensure vector length is multiple of 8 by padding if needed - dim = len(binary_vector) - padded_dim = ((dim + 7) // 8) * 8 - if padded_dim > dim: - padded = np.zeros(padded_dim, dtype=np.uint8) - padded[:dim] = binary_vector - binary_vector = padded - - # Pack 8 bits per byte - packed = np.packbits(binary_vector) - return packed.tobytes() - - -def unpack_binary_embedding(packed_bytes: bytes, dim: int = 256) -> np.ndarray: - """Unpack bytes back to binary vector. 
- - Args: - packed_bytes: Packed binary data - dim: Original vector dimension (default: 256) - - Returns: - Binary vector (uint8 with values 0 or 1) - """ - unpacked = np.unpackbits(np.frombuffer(packed_bytes, dtype=np.uint8)) - return unpacked[:dim] - - -def hamming_distance(a: bytes, b: bytes) -> int: - """Compute Hamming distance between two packed binary vectors. - - Uses XOR and popcount for efficient distance computation. - - Args: - a: First packed binary vector - b: Second packed binary vector - - Returns: - Hamming distance (number of differing bits) - """ - a_arr = np.frombuffer(a, dtype=np.uint8) - b_arr = np.frombuffer(b, dtype=np.uint8) - xor = np.bitwise_xor(a_arr, b_arr) - return int(np.unpackbits(xor).sum()) - - -# ============================================================================= -# Binary Embedding Backend -# ============================================================================= - - -class BinaryEmbeddingBackend(BaseEmbedder): - """Generate 256-dimensional binary embeddings for fast coarse retrieval. - - Uses a lightweight embedding model and applies sign-based quantization - to produce compact binary vectors (32 bytes per embedding). - - Suitable for: - - First-stage candidate retrieval - - Hamming distance-based similarity search - - Memory-constrained environments - - Model: sentence-transformers/all-MiniLM-L6-v2 (384 dim) -> quantized to 256 bits - """ - - DEFAULT_MODEL = "BAAI/bge-small-en-v1.5" # 384 dim, fast - BINARY_DIM = 256 - - def __init__( - self, - model_name: Optional[str] = None, - use_gpu: bool = True, - ) -> None: - """Initialize binary embedding backend. - - Args: - model_name: Base embedding model name. Defaults to BAAI/bge-small-en-v1.5 - use_gpu: Whether to use GPU acceleration - """ - from codexlens.semantic import SEMANTIC_AVAILABLE - - if not SEMANTIC_AVAILABLE: - raise ImportError( - "Semantic search dependencies not available. 
" - "Install with: pip install codexlens[semantic]" - ) - - self._model_name = model_name or self.DEFAULT_MODEL - self._use_gpu = use_gpu - self._model = None - - # Projection matrix for dimension reduction (lazily initialized) - self._projection_matrix: Optional[np.ndarray] = None - - @property - def model_name(self) -> str: - """Return model name.""" - return self._model_name - - @property - def embedding_dim(self) -> int: - """Return binary embedding dimension (256).""" - return self.BINARY_DIM - - @property - def packed_bytes(self) -> int: - """Return packed bytes size (32 bytes for 256 bits).""" - return self.BINARY_DIM // 8 - - def _load_model(self) -> None: - """Lazy load the embedding model.""" - if self._model is not None: - return - - from fastembed import TextEmbedding - from codexlens.semantic.gpu_support import get_optimal_providers - - providers = get_optimal_providers(use_gpu=self._use_gpu, with_device_options=True) - try: - self._model = TextEmbedding( - model_name=self._model_name, - providers=providers, - ) - except TypeError: - # Fallback for older fastembed versions - self._model = TextEmbedding(model_name=self._model_name) - - logger.debug(f"BinaryEmbeddingBackend loaded model: {self._model_name}") - - def _get_projection_matrix(self, input_dim: int) -> np.ndarray: - """Get or create projection matrix for dimension reduction. - - Uses random projection with fixed seed for reproducibility. 
- - Args: - input_dim: Input embedding dimension from base model - - Returns: - Projection matrix of shape (input_dim, BINARY_DIM) - """ - if self._projection_matrix is not None: - return self._projection_matrix - - # Fixed seed for reproducibility across sessions - rng = np.random.RandomState(42) - # Gaussian random projection - self._projection_matrix = rng.randn(input_dim, self.BINARY_DIM).astype(np.float32) - # Normalize columns for consistent scale - norms = np.linalg.norm(self._projection_matrix, axis=0, keepdims=True) - self._projection_matrix /= (norms + 1e-8) - - return self._projection_matrix - - def embed_to_numpy(self, texts: str | Iterable[str]) -> np.ndarray: - """Generate binary embeddings as numpy array. - - Args: - texts: Single text or iterable of texts - - Returns: - Binary embeddings of shape (n_texts, 256) with values 0 or 1 - """ - self._load_model() - - if isinstance(texts, str): - texts = [texts] - else: - texts = list(texts) - - # Get base float embeddings - float_embeddings = np.array(list(self._model.embed(texts))) - input_dim = float_embeddings.shape[1] - - # Project to target dimension if needed - if input_dim != self.BINARY_DIM: - projection = self._get_projection_matrix(input_dim) - float_embeddings = float_embeddings @ projection - - # Binarize - return binarize_embedding(float_embeddings) - - def embed_packed(self, texts: str | Iterable[str]) -> List[bytes]: - """Generate packed binary embeddings. - - Args: - texts: Single text or iterable of texts - - Returns: - List of packed bytes (32 bytes each for 256-dim) - """ - binary = self.embed_to_numpy(texts) - return [pack_binary_embedding(vec) for vec in binary] - - -# ============================================================================= -# Dense Embedding Backend -# ============================================================================= - - -class DenseEmbeddingBackend(BaseEmbedder): - """Generate high-dimensional dense embeddings for precise reranking. 
- - Uses large embedding models to produce 2048-dimensional float32 vectors - for maximum retrieval quality. - - Suitable for: - - Second-stage reranking - - High-precision similarity search - - Quality-critical applications - - Model: BAAI/bge-large-en-v1.5 (1024 dim) with optional expansion - """ - - DEFAULT_MODEL = "BAAI/bge-small-en-v1.5" # 384 dim, use small for testing - TARGET_DIM = 768 # Reduced target for faster testing - - def __init__( - self, - model_name: Optional[str] = None, - use_gpu: bool = True, - expand_dim: bool = True, - ) -> None: - """Initialize dense embedding backend. - - Args: - model_name: Dense embedding model name. Defaults to BAAI/bge-large-en-v1.5 - use_gpu: Whether to use GPU acceleration - expand_dim: If True, expand embeddings to TARGET_DIM using learned expansion - """ - from codexlens.semantic import SEMANTIC_AVAILABLE - - if not SEMANTIC_AVAILABLE: - raise ImportError( - "Semantic search dependencies not available. " - "Install with: pip install codexlens[semantic]" - ) - - self._model_name = model_name or self.DEFAULT_MODEL - self._use_gpu = use_gpu - self._expand_dim = expand_dim - self._model = None - self._native_dim: Optional[int] = None - - # Expansion matrix for dimension expansion (lazily initialized) - self._expansion_matrix: Optional[np.ndarray] = None - - @property - def model_name(self) -> str: - """Return model name.""" - return self._model_name - - @property - def embedding_dim(self) -> int: - """Return embedding dimension. - - Returns TARGET_DIM if expand_dim is True, otherwise native model dimension. 
- """ - if self._expand_dim: - return self.TARGET_DIM - # Return cached native dim or estimate based on model - if self._native_dim is not None: - return self._native_dim - # Model dimension estimates - model_dims = { - "BAAI/bge-large-en-v1.5": 1024, - "BAAI/bge-base-en-v1.5": 768, - "BAAI/bge-small-en-v1.5": 384, - "intfloat/multilingual-e5-large": 1024, - } - return model_dims.get(self._model_name, 1024) - - @property - def max_tokens(self) -> int: - """Return maximum token limit.""" - return 512 # Conservative default for large models - - def _load_model(self) -> None: - """Lazy load the embedding model.""" - if self._model is not None: - return - - from fastembed import TextEmbedding - from codexlens.semantic.gpu_support import get_optimal_providers - - providers = get_optimal_providers(use_gpu=self._use_gpu, with_device_options=True) - try: - self._model = TextEmbedding( - model_name=self._model_name, - providers=providers, - ) - except TypeError: - self._model = TextEmbedding(model_name=self._model_name) - - logger.debug(f"DenseEmbeddingBackend loaded model: {self._model_name}") - - def _get_expansion_matrix(self, input_dim: int) -> np.ndarray: - """Get or create expansion matrix for dimension expansion. - - Uses random orthogonal projection for information-preserving expansion. 
- - Args: - input_dim: Input embedding dimension from base model - - Returns: - Expansion matrix of shape (input_dim, TARGET_DIM) - """ - if self._expansion_matrix is not None: - return self._expansion_matrix - - # Fixed seed for reproducibility - rng = np.random.RandomState(123) - - # Create semi-orthogonal expansion matrix - # First input_dim columns form identity-like structure - self._expansion_matrix = np.zeros((input_dim, self.TARGET_DIM), dtype=np.float32) - - # Copy original dimensions - copy_dim = min(input_dim, self.TARGET_DIM) - self._expansion_matrix[:copy_dim, :copy_dim] = np.eye(copy_dim, dtype=np.float32) - - # Fill remaining with random projections - if self.TARGET_DIM > input_dim: - random_part = rng.randn(input_dim, self.TARGET_DIM - input_dim).astype(np.float32) - # Normalize - norms = np.linalg.norm(random_part, axis=0, keepdims=True) - random_part /= (norms + 1e-8) - self._expansion_matrix[:, input_dim:] = random_part - - return self._expansion_matrix - - def embed_to_numpy(self, texts: str | Iterable[str]) -> np.ndarray: - """Generate dense embeddings as numpy array. 
- - Args: - texts: Single text or iterable of texts - - Returns: - Dense embeddings of shape (n_texts, TARGET_DIM) as float32 - """ - self._load_model() - - if isinstance(texts, str): - texts = [texts] - else: - texts = list(texts) - - # Get base float embeddings - float_embeddings = np.array(list(self._model.embed(texts)), dtype=np.float32) - self._native_dim = float_embeddings.shape[1] - - # Expand to target dimension if needed - if self._expand_dim and self._native_dim < self.TARGET_DIM: - expansion = self._get_expansion_matrix(self._native_dim) - float_embeddings = float_embeddings @ expansion - - return float_embeddings - - -# ============================================================================= -# Cascade Embedding Backend -# ============================================================================= - - -class CascadeEmbeddingBackend(BaseEmbedder): - """Combined binary + dense embedding backend for cascade retrieval. - - Generates both binary (for fast coarse filtering) and dense (for precise - reranking) embeddings in a single pass, optimized for two-stage retrieval. - - Cascade workflow: - 1. encode_cascade() returns (binary_embeddings, dense_embeddings) - 2. Binary search: Use Hamming distance on binary vectors -> top-K candidates - 3. Dense rerank: Use cosine similarity on dense vectors -> final results - - Memory efficiency: - - Binary: 32 bytes per vector (256 bits) - - Dense: 8192 bytes per vector (2048 x float32) - - Total: ~8KB per document for full cascade support - """ - - def __init__( - self, - binary_model: Optional[str] = None, - dense_model: Optional[str] = None, - use_gpu: bool = True, - ) -> None: - """Initialize cascade embedding backend. - - Args: - binary_model: Model for binary embeddings. Defaults to BAAI/bge-small-en-v1.5 - dense_model: Model for dense embeddings. 
Defaults to BAAI/bge-large-en-v1.5 - use_gpu: Whether to use GPU acceleration - """ - self._binary_backend = BinaryEmbeddingBackend( - model_name=binary_model, - use_gpu=use_gpu, - ) - self._dense_backend = DenseEmbeddingBackend( - model_name=dense_model, - use_gpu=use_gpu, - expand_dim=True, - ) - self._use_gpu = use_gpu - - @property - def model_name(self) -> str: - """Return model names for both backends.""" - return f"cascade({self._binary_backend.model_name}, {self._dense_backend.model_name})" - - @property - def embedding_dim(self) -> int: - """Return dense embedding dimension (for compatibility).""" - return self._dense_backend.embedding_dim - - @property - def binary_dim(self) -> int: - """Return binary embedding dimension.""" - return self._binary_backend.embedding_dim - - @property - def dense_dim(self) -> int: - """Return dense embedding dimension.""" - return self._dense_backend.embedding_dim - - def embed_to_numpy(self, texts: str | Iterable[str]) -> np.ndarray: - """Generate dense embeddings (for BaseEmbedder compatibility). - - For cascade embeddings, use encode_cascade() instead. - - Args: - texts: Single text or iterable of texts - - Returns: - Dense embeddings of shape (n_texts, dense_dim) - """ - return self._dense_backend.embed_to_numpy(texts) - - def encode_cascade( - self, - texts: str | Iterable[str], - batch_size: int = 32, - ) -> Tuple[np.ndarray, np.ndarray]: - """Generate both binary and dense embeddings. 
- - Args: - texts: Single text or iterable of texts - batch_size: Batch size for processing - - Returns: - Tuple of: - - binary_embeddings: Shape (n_texts, 256), uint8 values 0/1 - - dense_embeddings: Shape (n_texts, 2048), float32 - """ - if isinstance(texts, str): - texts = [texts] - else: - texts = list(texts) - - binary_embeddings = self._binary_backend.embed_to_numpy(texts) - dense_embeddings = self._dense_backend.embed_to_numpy(texts) - - return binary_embeddings, dense_embeddings - - def encode_binary(self, texts: str | Iterable[str]) -> np.ndarray: - """Generate only binary embeddings. - - Args: - texts: Single text or iterable of texts - - Returns: - Binary embeddings of shape (n_texts, 256) - """ - return self._binary_backend.embed_to_numpy(texts) - - def encode_dense(self, texts: str | Iterable[str]) -> np.ndarray: - """Generate only dense embeddings. - - Args: - texts: Single text or iterable of texts - - Returns: - Dense embeddings of shape (n_texts, 2048) - """ - return self._dense_backend.embed_to_numpy(texts) - - def encode_binary_packed(self, texts: str | Iterable[str]) -> List[bytes]: - """Generate packed binary embeddings. - - Args: - texts: Single text or iterable of texts - - Returns: - List of packed bytes (32 bytes each) - """ - return self._binary_backend.embed_packed(texts) - - -# ============================================================================= -# Factory Function -# ============================================================================= - - -def get_cascade_embedder( - binary_model: Optional[str] = None, - dense_model: Optional[str] = None, - use_gpu: bool = True, -) -> CascadeEmbeddingBackend: - """Factory function to create a cascade embedder. 
- - Args: - binary_model: Model for binary embeddings (default: BAAI/bge-small-en-v1.5) - dense_model: Model for dense embeddings (default: BAAI/bge-large-en-v1.5) - use_gpu: Whether to use GPU acceleration - - Returns: - Configured CascadeEmbeddingBackend instance - - Example: - >>> embedder = get_cascade_embedder() - >>> binary, dense = embedder.encode_cascade(["hello world"]) - >>> binary.shape # (1, 256) - >>> dense.shape # (1, 2048) - """ - return CascadeEmbeddingBackend( - binary_model=binary_model, - dense_model=dense_model, - use_gpu=use_gpu, - ) diff --git a/codex-lens/build/lib/codexlens/indexing/symbol_extractor.py b/codex-lens/build/lib/codexlens/indexing/symbol_extractor.py deleted file mode 100644 index 45439e7b..00000000 --- a/codex-lens/build/lib/codexlens/indexing/symbol_extractor.py +++ /dev/null @@ -1,277 +0,0 @@ -"""Symbol and relationship extraction from source code.""" -import re -import sqlite3 -from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple - -try: - from codexlens.parsers.treesitter_parser import TreeSitterSymbolParser -except Exception: # pragma: no cover - optional dependency / platform variance - TreeSitterSymbolParser = None # type: ignore[assignment] - - -class SymbolExtractor: - """Extract symbols and relationships from source code using regex patterns.""" - - # Pattern definitions for different languages - PATTERNS = { - 'python': { - 'function': r'^(?:async\s+)?def\s+(\w+)\s*\(', - 'class': r'^class\s+(\w+)\s*[:\(]', - 'import': r'^(?:from\s+([\w.]+)\s+)?import\s+([\w.,\s]+)', - 'call': r'(? 
None: - """Connect to database and ensure schema exists.""" - self.db_conn = sqlite3.connect(str(self.db_path)) - self._ensure_tables() - - def __enter__(self) -> "SymbolExtractor": - """Context manager entry: connect to database.""" - self.connect() - return self - - def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: - """Context manager exit: close database connection.""" - self.close() - - def _ensure_tables(self) -> None: - """Create symbols and relationships tables if they don't exist.""" - if not self.db_conn: - return - cursor = self.db_conn.cursor() - - # Create symbols table with qualified_name - cursor.execute(''' - CREATE TABLE IF NOT EXISTS symbols ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - qualified_name TEXT NOT NULL, - name TEXT NOT NULL, - kind TEXT NOT NULL, - file_path TEXT NOT NULL, - start_line INTEGER NOT NULL, - end_line INTEGER NOT NULL, - UNIQUE(file_path, name, start_line) - ) - ''') - - # Create relationships table with target_symbol_fqn - cursor.execute(''' - CREATE TABLE IF NOT EXISTS symbol_relationships ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - source_symbol_id INTEGER NOT NULL, - target_symbol_fqn TEXT NOT NULL, - relationship_type TEXT NOT NULL, - file_path TEXT NOT NULL, - line INTEGER, - FOREIGN KEY (source_symbol_id) REFERENCES symbols(id) ON DELETE CASCADE - ) - ''') - - # Create performance indexes - cursor.execute('CREATE INDEX IF NOT EXISTS idx_symbols_name ON symbols(name)') - cursor.execute('CREATE INDEX IF NOT EXISTS idx_symbols_file ON symbols(file_path)') - cursor.execute('CREATE INDEX IF NOT EXISTS idx_rel_source ON symbol_relationships(source_symbol_id)') - cursor.execute('CREATE INDEX IF NOT EXISTS idx_rel_target ON symbol_relationships(target_symbol_fqn)') - cursor.execute('CREATE INDEX IF NOT EXISTS idx_rel_type ON symbol_relationships(relationship_type)') - - self.db_conn.commit() - - def extract_from_file(self, file_path: Path, content: str) -> Tuple[List[Dict], List[Dict]]: - """Extract 
symbols and relationships from file content. - - Args: - file_path: Path to the source file - content: File content as string - - Returns: - Tuple of (symbols, relationships) where: - - symbols: List of symbol dicts with qualified_name, name, kind, file_path, start_line, end_line - - relationships: List of relationship dicts with source_scope, target, type, file_path, line - """ - ext = file_path.suffix.lower() - lang = self.LANGUAGE_MAP.get(ext) - - if not lang or lang not in self.PATTERNS: - return [], [] - - patterns = self.PATTERNS[lang] - symbols = [] - relationships: List[Dict] = [] - lines = content.split('\n') - - current_scope = None - - for line_num, line in enumerate(lines, 1): - # Extract function/class definitions - for kind in ['function', 'class']: - if kind in patterns: - match = re.search(patterns[kind], line) - if match: - name = match.group(1) - qualified_name = f"{file_path.stem}.{name}" - symbols.append({ - 'qualified_name': qualified_name, - 'name': name, - 'kind': kind, - 'file_path': str(file_path), - 'start_line': line_num, - 'end_line': line_num, # Simplified - would need proper parsing for actual end - }) - current_scope = name - - if TreeSitterSymbolParser is not None: - try: - ts_parser = TreeSitterSymbolParser(lang, file_path) - if ts_parser.is_available(): - indexed = ts_parser.parse(content, file_path) - if indexed is not None and indexed.relationships: - relationships = [ - { - "source_scope": r.source_symbol, - "target": r.target_symbol, - "type": r.relationship_type.value, - "file_path": str(file_path), - "line": r.source_line, - } - for r in indexed.relationships - ] - except Exception: - relationships = [] - - # Regex fallback for relationships (when tree-sitter is unavailable) - if not relationships: - current_scope = None - for line_num, line in enumerate(lines, 1): - for kind in ['function', 'class']: - if kind in patterns: - match = re.search(patterns[kind], line) - if match: - current_scope = match.group(1) - - # Extract 
imports - if 'import' in patterns: - match = re.search(patterns['import'], line) - if match: - import_target = match.group(1) or match.group(2) if match.lastindex >= 2 else match.group(1) - if import_target and current_scope: - relationships.append({ - 'source_scope': current_scope, - 'target': import_target.strip(), - 'type': 'imports', - 'file_path': str(file_path), - 'line': line_num, - }) - - # Extract function calls (simplified) - if 'call' in patterns and current_scope: - for match in re.finditer(patterns['call'], line): - call_name = match.group(1) - # Skip common keywords and the current function - if call_name not in ['if', 'for', 'while', 'return', 'print', 'len', 'str', 'int', 'float', 'list', 'dict', 'set', 'tuple', current_scope]: - relationships.append({ - 'source_scope': current_scope, - 'target': call_name, - 'type': 'calls', - 'file_path': str(file_path), - 'line': line_num, - }) - - return symbols, relationships - - def save_symbols(self, symbols: List[Dict]) -> Dict[str, int]: - """Save symbols to database and return name->id mapping. - - Args: - symbols: List of symbol dicts with qualified_name, name, kind, file_path, start_line, end_line - - Returns: - Dictionary mapping symbol name to database id - """ - if not self.db_conn or not symbols: - return {} - - cursor = self.db_conn.cursor() - name_to_id = {} - - for sym in symbols: - try: - cursor.execute(''' - INSERT OR IGNORE INTO symbols - (qualified_name, name, kind, file_path, start_line, end_line) - VALUES (?, ?, ?, ?, ?, ?) - ''', (sym['qualified_name'], sym['name'], sym['kind'], - sym['file_path'], sym['start_line'], sym['end_line'])) - - # Get the id - cursor.execute(''' - SELECT id FROM symbols - WHERE file_path = ? AND name = ? AND start_line = ? 
- ''', (sym['file_path'], sym['name'], sym['start_line'])) - - row = cursor.fetchone() - if row: - name_to_id[sym['name']] = row[0] - except sqlite3.Error: - continue - - self.db_conn.commit() - return name_to_id - - def save_relationships(self, relationships: List[Dict], name_to_id: Dict[str, int]) -> None: - """Save relationships to database. - - Args: - relationships: List of relationship dicts with source_scope, target, type, file_path, line - name_to_id: Dictionary mapping symbol names to database ids - """ - if not self.db_conn or not relationships: - return - - cursor = self.db_conn.cursor() - - for rel in relationships: - source_id = name_to_id.get(rel['source_scope']) - if source_id: - try: - cursor.execute(''' - INSERT INTO symbol_relationships - (source_symbol_id, target_symbol_fqn, relationship_type, file_path, line) - VALUES (?, ?, ?, ?, ?) - ''', (source_id, rel['target'], rel['type'], rel['file_path'], rel['line'])) - except sqlite3.Error: - continue - - self.db_conn.commit() - - def close(self) -> None: - """Close database connection.""" - if self.db_conn: - self.db_conn.close() - self.db_conn = None diff --git a/codex-lens/build/lib/codexlens/lsp/__init__.py b/codex-lens/build/lib/codexlens/lsp/__init__.py deleted file mode 100644 index e2c851e2..00000000 --- a/codex-lens/build/lib/codexlens/lsp/__init__.py +++ /dev/null @@ -1,34 +0,0 @@ -"""LSP module for real-time language server integration. - -This module provides: -- LspBridge: HTTP bridge to VSCode language servers -- LspGraphBuilder: Build code association graphs via LSP -- Location: Position in a source file - -Example: - >>> from codexlens.lsp import LspBridge, LspGraphBuilder - >>> - >>> async with LspBridge() as bridge: - ... refs = await bridge.get_references(symbol) - ... 
graph = await LspGraphBuilder().build_from_seeds(seeds, bridge) -""" - -from codexlens.lsp.lsp_bridge import ( - CacheEntry, - Location, - LspBridge, -) -from codexlens.lsp.lsp_graph_builder import ( - LspGraphBuilder, -) - -# Alias for backward compatibility -GraphBuilder = LspGraphBuilder - -__all__ = [ - "CacheEntry", - "GraphBuilder", - "Location", - "LspBridge", - "LspGraphBuilder", -] diff --git a/codex-lens/build/lib/codexlens/lsp/handlers.py b/codex-lens/build/lib/codexlens/lsp/handlers.py deleted file mode 100644 index 3fb17e40..00000000 --- a/codex-lens/build/lib/codexlens/lsp/handlers.py +++ /dev/null @@ -1,551 +0,0 @@ -"""LSP request handlers for codex-lens. - -This module contains handlers for LSP requests: -- textDocument/definition -- textDocument/completion -- workspace/symbol -- textDocument/didSave -- textDocument/hover -""" - -from __future__ import annotations - -import logging -import re -from pathlib import Path -from typing import List, Optional, Union -from urllib.parse import quote, unquote - -try: - from lsprotocol import types as lsp -except ImportError as exc: - raise ImportError( - "LSP dependencies not installed. 
Install with: pip install codex-lens[lsp]" - ) from exc - -from codexlens.entities import Symbol -from codexlens.lsp.server import server - -logger = logging.getLogger(__name__) - -# Symbol kind mapping from codex-lens to LSP -SYMBOL_KIND_MAP = { - "class": lsp.SymbolKind.Class, - "function": lsp.SymbolKind.Function, - "method": lsp.SymbolKind.Method, - "variable": lsp.SymbolKind.Variable, - "constant": lsp.SymbolKind.Constant, - "property": lsp.SymbolKind.Property, - "field": lsp.SymbolKind.Field, - "interface": lsp.SymbolKind.Interface, - "module": lsp.SymbolKind.Module, - "namespace": lsp.SymbolKind.Namespace, - "package": lsp.SymbolKind.Package, - "enum": lsp.SymbolKind.Enum, - "enum_member": lsp.SymbolKind.EnumMember, - "struct": lsp.SymbolKind.Struct, - "type": lsp.SymbolKind.TypeParameter, - "type_alias": lsp.SymbolKind.TypeParameter, -} - -# Completion kind mapping from codex-lens to LSP -COMPLETION_KIND_MAP = { - "class": lsp.CompletionItemKind.Class, - "function": lsp.CompletionItemKind.Function, - "method": lsp.CompletionItemKind.Method, - "variable": lsp.CompletionItemKind.Variable, - "constant": lsp.CompletionItemKind.Constant, - "property": lsp.CompletionItemKind.Property, - "field": lsp.CompletionItemKind.Field, - "interface": lsp.CompletionItemKind.Interface, - "module": lsp.CompletionItemKind.Module, - "enum": lsp.CompletionItemKind.Enum, - "enum_member": lsp.CompletionItemKind.EnumMember, - "struct": lsp.CompletionItemKind.Struct, - "type": lsp.CompletionItemKind.TypeParameter, - "type_alias": lsp.CompletionItemKind.TypeParameter, -} - - -def _path_to_uri(path: Union[str, Path]) -> str: - """Convert a file path to a URI. 
- - Args: - path: File path (string or Path object) - - Returns: - File URI string - """ - path_str = str(Path(path).resolve()) - # Handle Windows paths - if path_str.startswith("/"): - return f"file://{quote(path_str)}" - else: - return f"file:///{quote(path_str.replace(chr(92), '/'))}" - - -def _uri_to_path(uri: str) -> Path: - """Convert a URI to a file path. - - Args: - uri: File URI string - - Returns: - Path object - """ - path = uri.replace("file:///", "").replace("file://", "") - return Path(unquote(path)) - - -def _get_word_at_position(document_text: str, line: int, character: int) -> Optional[str]: - """Extract the word at the given position in the document. - - Args: - document_text: Full document text - line: 0-based line number - character: 0-based character position - - Returns: - Word at position, or None if no word found - """ - lines = document_text.splitlines() - if line >= len(lines): - return None - - line_text = lines[line] - if character > len(line_text): - return None - - # Find word boundaries - word_pattern = re.compile(r"[a-zA-Z_][a-zA-Z0-9_]*") - for match in word_pattern.finditer(line_text): - if match.start() <= character <= match.end(): - return match.group() - - return None - - -def _get_prefix_at_position(document_text: str, line: int, character: int) -> str: - """Extract the incomplete word prefix at the given position. 
- - Args: - document_text: Full document text - line: 0-based line number - character: 0-based character position - - Returns: - Prefix string (may be empty) - """ - lines = document_text.splitlines() - if line >= len(lines): - return "" - - line_text = lines[line] - if character > len(line_text): - character = len(line_text) - - # Extract text before cursor - before_cursor = line_text[:character] - - # Find the start of the current word - match = re.search(r"[a-zA-Z_][a-zA-Z0-9_]*$", before_cursor) - if match: - return match.group() - - return "" - - -def symbol_to_location(symbol: Symbol) -> Optional[lsp.Location]: - """Convert a codex-lens Symbol to an LSP Location. - - Args: - symbol: codex-lens Symbol object - - Returns: - LSP Location, or None if symbol has no file - """ - if not symbol.file: - return None - - # LSP uses 0-based lines, codex-lens uses 1-based - start_line = max(0, symbol.range[0] - 1) - end_line = max(0, symbol.range[1] - 1) - - return lsp.Location( - uri=_path_to_uri(symbol.file), - range=lsp.Range( - start=lsp.Position(line=start_line, character=0), - end=lsp.Position(line=end_line, character=0), - ), - ) - - -def _symbol_kind_to_lsp(kind: str) -> lsp.SymbolKind: - """Map codex-lens symbol kind to LSP SymbolKind. - - Args: - kind: codex-lens symbol kind string - - Returns: - LSP SymbolKind - """ - return SYMBOL_KIND_MAP.get(kind.lower(), lsp.SymbolKind.Variable) - - -def _symbol_kind_to_completion_kind(kind: str) -> lsp.CompletionItemKind: - """Map codex-lens symbol kind to LSP CompletionItemKind. 
- - Args: - kind: codex-lens symbol kind string - - Returns: - LSP CompletionItemKind - """ - return COMPLETION_KIND_MAP.get(kind.lower(), lsp.CompletionItemKind.Text) - - -# ----------------------------------------------------------------------------- -# LSP Request Handlers -# ----------------------------------------------------------------------------- - - -@server.feature(lsp.TEXT_DOCUMENT_DEFINITION) -def lsp_definition( - params: lsp.DefinitionParams, -) -> Optional[Union[lsp.Location, List[lsp.Location]]]: - """Handle textDocument/definition request. - - Finds the definition of the symbol at the cursor position. - """ - if not server.global_index: - logger.debug("No global index available for definition lookup") - return None - - # Get document - document = server.workspace.get_text_document(params.text_document.uri) - if not document: - return None - - # Get word at position - word = _get_word_at_position( - document.source, - params.position.line, - params.position.character, - ) - - if not word: - logger.debug("No word found at position") - return None - - logger.debug("Looking up definition for: %s", word) - - # Search for exact symbol match - try: - symbols = server.global_index.search( - name=word, - limit=10, - prefix_mode=False, # Exact match preferred - ) - - # Filter for exact name match - exact_matches = [s for s in symbols if s.name == word] - if not exact_matches: - # Fall back to prefix search - symbols = server.global_index.search( - name=word, - limit=10, - prefix_mode=True, - ) - exact_matches = [s for s in symbols if s.name == word] - - if not exact_matches: - logger.debug("No definition found for: %s", word) - return None - - # Convert to LSP locations - locations = [] - for sym in exact_matches: - loc = symbol_to_location(sym) - if loc: - locations.append(loc) - - if len(locations) == 1: - return locations[0] - elif locations: - return locations - else: - return None - - except Exception as exc: - logger.error("Error looking up 
definition: %s", exc) - return None - - -@server.feature(lsp.TEXT_DOCUMENT_REFERENCES) -def lsp_references(params: lsp.ReferenceParams) -> Optional[List[lsp.Location]]: - """Handle textDocument/references request. - - Finds all references to the symbol at the cursor position using - the code_relationships table for accurate call-site tracking. - Falls back to same-name symbol search if search_engine is unavailable. - """ - document = server.workspace.get_text_document(params.text_document.uri) - if not document: - return None - - word = _get_word_at_position( - document.source, - params.position.line, - params.position.character, - ) - - if not word: - return None - - logger.debug("Finding references for: %s", word) - - try: - # Try using search_engine.search_references() for accurate reference tracking - if server.search_engine and server.workspace_root: - references = server.search_engine.search_references( - symbol_name=word, - source_path=server.workspace_root, - limit=200, - ) - - if references: - locations = [] - for ref in references: - locations.append( - lsp.Location( - uri=_path_to_uri(ref.file_path), - range=lsp.Range( - start=lsp.Position( - line=max(0, ref.line - 1), - character=ref.column, - ), - end=lsp.Position( - line=max(0, ref.line - 1), - character=ref.column + len(word), - ), - ), - ) - ) - return locations if locations else None - - # Fallback: search for symbols with same name using global_index - if server.global_index: - symbols = server.global_index.search( - name=word, - limit=100, - prefix_mode=False, - ) - - # Filter for exact matches - exact_matches = [s for s in symbols if s.name == word] - - locations = [] - for sym in exact_matches: - loc = symbol_to_location(sym) - if loc: - locations.append(loc) - - return locations if locations else None - - return None - - except Exception as exc: - logger.error("Error finding references: %s", exc) - return None - - -@server.feature(lsp.TEXT_DOCUMENT_COMPLETION) -def lsp_completion(params: 
lsp.CompletionParams) -> Optional[lsp.CompletionList]: - """Handle textDocument/completion request. - - Provides code completion suggestions based on indexed symbols. - """ - if not server.global_index: - return None - - document = server.workspace.get_text_document(params.text_document.uri) - if not document: - return None - - prefix = _get_prefix_at_position( - document.source, - params.position.line, - params.position.character, - ) - - if not prefix or len(prefix) < 2: - # Require at least 2 characters for completion - return None - - logger.debug("Completing prefix: %s", prefix) - - try: - symbols = server.global_index.search( - name=prefix, - limit=50, - prefix_mode=True, - ) - - if not symbols: - return None - - # Convert to completion items - items = [] - seen_names = set() - - for sym in symbols: - if sym.name in seen_names: - continue - seen_names.add(sym.name) - - items.append( - lsp.CompletionItem( - label=sym.name, - kind=_symbol_kind_to_completion_kind(sym.kind), - detail=f"{sym.kind} - {Path(sym.file).name if sym.file else 'unknown'}", - sort_text=sym.name.lower(), - ) - ) - - return lsp.CompletionList( - is_incomplete=len(symbols) >= 50, - items=items, - ) - - except Exception as exc: - logger.error("Error getting completions: %s", exc) - return None - - -@server.feature(lsp.TEXT_DOCUMENT_HOVER) -def lsp_hover(params: lsp.HoverParams) -> Optional[lsp.Hover]: - """Handle textDocument/hover request. - - Provides hover information for the symbol at the cursor position - using HoverProvider for rich symbol information including - signature, documentation, and location. 
- """ - if not server.global_index: - return None - - document = server.workspace.get_text_document(params.text_document.uri) - if not document: - return None - - word = _get_word_at_position( - document.source, - params.position.line, - params.position.character, - ) - - if not word: - return None - - logger.debug("Hover for: %s", word) - - try: - # Use HoverProvider for rich symbol information - from codexlens.lsp.providers import HoverProvider - - provider = HoverProvider(server.global_index, server.registry) - info = provider.get_hover_info(word) - - if not info: - return None - - # Format as markdown with signature and location - content = provider.format_hover_markdown(info) - - return lsp.Hover( - contents=lsp.MarkupContent( - kind=lsp.MarkupKind.Markdown, - value=content, - ), - ) - - except Exception as exc: - logger.error("Error getting hover info: %s", exc) - return None - - -@server.feature(lsp.WORKSPACE_SYMBOL) -def lsp_workspace_symbol( - params: lsp.WorkspaceSymbolParams, -) -> Optional[List[lsp.SymbolInformation]]: - """Handle workspace/symbol request. - - Searches for symbols across the workspace. 
- """ - if not server.global_index: - return None - - query = params.query - if not query or len(query) < 2: - return None - - logger.debug("Workspace symbol search: %s", query) - - try: - symbols = server.global_index.search( - name=query, - limit=100, - prefix_mode=True, - ) - - if not symbols: - return None - - result = [] - for sym in symbols: - loc = symbol_to_location(sym) - if loc: - result.append( - lsp.SymbolInformation( - name=sym.name, - kind=_symbol_kind_to_lsp(sym.kind), - location=loc, - container_name=Path(sym.file).parent.name if sym.file else None, - ) - ) - - return result if result else None - - except Exception as exc: - logger.error("Error searching workspace symbols: %s", exc) - return None - - -@server.feature(lsp.TEXT_DOCUMENT_DID_SAVE) -def lsp_did_save(params: lsp.DidSaveTextDocumentParams) -> None: - """Handle textDocument/didSave notification. - - Triggers incremental re-indexing of the saved file. - Note: Full incremental indexing requires WatcherManager integration, - which is planned for Phase 2. 
- """ - file_path = _uri_to_path(params.text_document.uri) - logger.info("File saved: %s", file_path) - - # Phase 1: Just log the save event - # Phase 2 will integrate with WatcherManager for incremental indexing - # if server.watcher_manager: - # server.watcher_manager.trigger_reindex(file_path) - - -@server.feature(lsp.TEXT_DOCUMENT_DID_OPEN) -def lsp_did_open(params: lsp.DidOpenTextDocumentParams) -> None: - """Handle textDocument/didOpen notification.""" - file_path = _uri_to_path(params.text_document.uri) - logger.debug("File opened: %s", file_path) - - -@server.feature(lsp.TEXT_DOCUMENT_DID_CLOSE) -def lsp_did_close(params: lsp.DidCloseTextDocumentParams) -> None: - """Handle textDocument/didClose notification.""" - file_path = _uri_to_path(params.text_document.uri) - logger.debug("File closed: %s", file_path) diff --git a/codex-lens/build/lib/codexlens/lsp/lsp_bridge.py b/codex-lens/build/lib/codexlens/lsp/lsp_bridge.py deleted file mode 100644 index 4f25b055..00000000 --- a/codex-lens/build/lib/codexlens/lsp/lsp_bridge.py +++ /dev/null @@ -1,834 +0,0 @@ -"""LspBridge service for real-time LSP communication with caching. - -This module provides a bridge to communicate with language servers either via: -1. Standalone LSP Manager (direct subprocess communication - default) -2. 
VSCode Bridge extension (HTTP-based, legacy mode) - -Features: -- Direct communication with language servers (no VSCode dependency) -- Cache with TTL and file modification time invalidation -- Graceful error handling with empty results on failure -- Support for definition, references, hover, and call hierarchy -""" - -from __future__ import annotations - -import asyncio -import os -import time -from collections import OrderedDict -from dataclasses import dataclass, field -from pathlib import Path -from typing import Any, Dict, List, Optional, TYPE_CHECKING - -if TYPE_CHECKING: - from codexlens.lsp.standalone_manager import StandaloneLspManager - -# Check for optional dependencies -try: - import aiohttp - HAS_AIOHTTP = True -except ImportError: - HAS_AIOHTTP = False - -from codexlens.hybrid_search.data_structures import ( - CallHierarchyItem, - CodeSymbolNode, - Range, -) - - -@dataclass -class Location: - """A location in a source file (LSP response format).""" - - file_path: str - line: int - character: int - - def to_dict(self) -> Dict[str, Any]: - """Convert to dictionary format.""" - return { - "file_path": self.file_path, - "line": self.line, - "character": self.character, - } - - @classmethod - def from_lsp_response(cls, data: Dict[str, Any]) -> "Location": - """Create Location from LSP response format. - - Handles both direct format and VSCode URI format. 
- """ - # Handle VSCode URI format (file:///path/to/file) - uri = data.get("uri", data.get("file_path", "")) - if uri.startswith("file:///"): - # Windows: file:///C:/path -> C:/path - # Unix: file:///path -> /path - file_path = uri[8:] if uri[8:9].isalpha() and uri[9:10] == ":" else uri[7:] - elif uri.startswith("file://"): - file_path = uri[7:] - else: - file_path = uri - - # Get position from range or direct fields - if "range" in data: - range_data = data["range"] - start = range_data.get("start", {}) - line = start.get("line", 0) + 1 # LSP is 0-based, convert to 1-based - character = start.get("character", 0) + 1 - else: - line = data.get("line", 1) - character = data.get("character", 1) - - return cls(file_path=file_path, line=line, character=character) - - -@dataclass -class CacheEntry: - """A cached LSP response with expiration metadata. - - Attributes: - data: The cached response data - file_mtime: File modification time when cached (for invalidation) - cached_at: Unix timestamp when entry was cached - """ - - data: Any - file_mtime: float - cached_at: float - - -class LspBridge: - """Bridge for real-time LSP communication with language servers. - - By default, uses StandaloneLspManager to directly spawn and communicate - with language servers via JSON-RPC over stdio. No VSCode dependency required. - - For legacy mode, can use VSCode Bridge HTTP server (set use_vscode_bridge=True). 
- - Features: - - Direct language server communication (default) - - Response caching with TTL and file modification invalidation - - Timeout handling - - Graceful error handling returning empty results - - Example: - # Default: standalone mode (no VSCode needed) - async with LspBridge() as bridge: - refs = await bridge.get_references(symbol) - definition = await bridge.get_definition(symbol) - - # Legacy: VSCode Bridge mode - async with LspBridge(use_vscode_bridge=True) as bridge: - refs = await bridge.get_references(symbol) - """ - - DEFAULT_BRIDGE_URL = "http://127.0.0.1:3457" - DEFAULT_TIMEOUT = 30.0 # seconds (increased for standalone mode) - DEFAULT_CACHE_TTL = 300 # 5 minutes - DEFAULT_MAX_CACHE_SIZE = 1000 # Maximum cache entries - - def __init__( - self, - bridge_url: str = DEFAULT_BRIDGE_URL, - timeout: float = DEFAULT_TIMEOUT, - cache_ttl: int = DEFAULT_CACHE_TTL, - max_cache_size: int = DEFAULT_MAX_CACHE_SIZE, - use_vscode_bridge: bool = False, - workspace_root: Optional[str] = None, - config_file: Optional[str] = None, - ): - """Initialize LspBridge. 
- - Args: - bridge_url: URL of the VSCode Bridge HTTP server (legacy mode only) - timeout: Request timeout in seconds - cache_ttl: Cache time-to-live in seconds - max_cache_size: Maximum number of cache entries (LRU eviction) - use_vscode_bridge: If True, use VSCode Bridge HTTP mode (requires aiohttp) - workspace_root: Root directory for standalone LSP manager - config_file: Path to lsp-servers.json configuration file - """ - self.bridge_url = bridge_url - self.timeout = timeout - self.cache_ttl = cache_ttl - self.max_cache_size = max_cache_size - self.use_vscode_bridge = use_vscode_bridge - self.workspace_root = workspace_root - self.config_file = config_file - - self.cache: OrderedDict[str, CacheEntry] = OrderedDict() - - # VSCode Bridge mode (legacy) - self._session: Optional["aiohttp.ClientSession"] = None - - # Standalone mode (default) - self._manager: Optional["StandaloneLspManager"] = None - self._manager_started = False - - # Validate dependencies - if use_vscode_bridge and not HAS_AIOHTTP: - raise ImportError( - "aiohttp is required for VSCode Bridge mode: pip install aiohttp" - ) - - async def _ensure_manager(self) -> "StandaloneLspManager": - """Ensure standalone LSP manager is started.""" - if self._manager is None: - from codexlens.lsp.standalone_manager import StandaloneLspManager - self._manager = StandaloneLspManager( - workspace_root=self.workspace_root, - config_file=self.config_file, - timeout=self.timeout, - ) - - if not self._manager_started: - await self._manager.start() - self._manager_started = True - - return self._manager - - async def _get_session(self) -> "aiohttp.ClientSession": - """Get or create the aiohttp session (VSCode Bridge mode only).""" - if not HAS_AIOHTTP: - raise ImportError("aiohttp required for VSCode Bridge mode") - - if self._session is None or self._session.closed: - timeout = aiohttp.ClientTimeout(total=self.timeout) - self._session = aiohttp.ClientSession(timeout=timeout) - return self._session - - async def 
close(self) -> None: - """Close connections and cleanup resources.""" - # Close VSCode Bridge session - if self._session and not self._session.closed: - await self._session.close() - self._session = None - - # Stop standalone manager - if self._manager and self._manager_started: - await self._manager.stop() - self._manager_started = False - - def _get_file_mtime(self, file_path: str) -> float: - """Get file modification time, or 0 if file doesn't exist.""" - try: - return os.path.getmtime(file_path) - except OSError: - return 0.0 - - def _is_cached(self, cache_key: str, file_path: str) -> bool: - """Check if cache entry is valid. - - Cache is invalid if: - - Entry doesn't exist - - TTL has expired - - File has been modified since caching - - Args: - cache_key: The cache key to check - file_path: Path to source file for mtime check - - Returns: - True if cache is valid and can be used - """ - if cache_key not in self.cache: - return False - - entry = self.cache[cache_key] - now = time.time() - - # Check TTL - if now - entry.cached_at > self.cache_ttl: - del self.cache[cache_key] - return False - - # Check file modification time - current_mtime = self._get_file_mtime(file_path) - if current_mtime != entry.file_mtime: - del self.cache[cache_key] - return False - - # Move to end on access (LRU behavior) - self.cache.move_to_end(cache_key) - return True - - def _cache(self, key: str, file_path: str, data: Any) -> None: - """Store data in cache with LRU eviction. 
- - Args: - key: Cache key - file_path: Path to source file (for mtime tracking) - data: Data to cache - """ - # Remove oldest entries if at capacity - while len(self.cache) >= self.max_cache_size: - self.cache.popitem(last=False) # Remove oldest (FIFO order) - - # Move to end if key exists (update access order) - if key in self.cache: - self.cache.move_to_end(key) - - self.cache[key] = CacheEntry( - data=data, - file_mtime=self._get_file_mtime(file_path), - cached_at=time.time(), - ) - - def clear_cache(self) -> None: - """Clear all cached entries.""" - self.cache.clear() - - async def _request_vscode_bridge(self, action: str, params: Dict[str, Any]) -> Any: - """Make HTTP request to VSCode Bridge (legacy mode). - - Args: - action: The endpoint/action name (e.g., "get_definition") - params: Request parameters - - Returns: - Response data on success, None on failure - """ - url = f"{self.bridge_url}/{action}" - - try: - session = await self._get_session() - async with session.post(url, json=params) as response: - if response.status != 200: - return None - - data = await response.json() - if data.get("success") is False: - return None - - return data.get("result") - - except asyncio.TimeoutError: - return None - except Exception: - return None - - async def get_references(self, symbol: CodeSymbolNode) -> List[Location]: - """Get all references to a symbol via real-time LSP. - - Args: - symbol: The code symbol to find references for - - Returns: - List of Location objects where the symbol is referenced. - Returns empty list on error or timeout. 
- """ - cache_key = f"refs:{symbol.id}" - - if self._is_cached(cache_key, symbol.file_path): - return self.cache[cache_key].data - - locations: List[Location] = [] - - if self.use_vscode_bridge: - # Legacy: VSCode Bridge HTTP mode - result = await self._request_vscode_bridge("get_references", { - "file_path": symbol.file_path, - "line": symbol.range.start_line, - "character": symbol.range.start_character, - }) - - # Don't cache on connection error (result is None) - if result is None: - return locations - - if isinstance(result, list): - for item in result: - try: - locations.append(Location.from_lsp_response(item)) - except (KeyError, TypeError): - continue - else: - # Default: Standalone mode - manager = await self._ensure_manager() - result = await manager.get_references( - file_path=symbol.file_path, - line=symbol.range.start_line, - character=symbol.range.start_character, - ) - - for item in result: - try: - locations.append(Location.from_lsp_response(item)) - except (KeyError, TypeError): - continue - - self._cache(cache_key, symbol.file_path, locations) - return locations - - async def get_definition(self, symbol: CodeSymbolNode) -> Optional[Location]: - """Get symbol definition location. 
- - Args: - symbol: The code symbol to find definition for - - Returns: - Location of the definition, or None if not found - """ - cache_key = f"def:{symbol.id}" - - if self._is_cached(cache_key, symbol.file_path): - return self.cache[cache_key].data - - location: Optional[Location] = None - - if self.use_vscode_bridge: - # Legacy: VSCode Bridge HTTP mode - result = await self._request_vscode_bridge("get_definition", { - "file_path": symbol.file_path, - "line": symbol.range.start_line, - "character": symbol.range.start_character, - }) - - if result: - if isinstance(result, list) and len(result) > 0: - try: - location = Location.from_lsp_response(result[0]) - except (KeyError, TypeError): - pass - elif isinstance(result, dict): - try: - location = Location.from_lsp_response(result) - except (KeyError, TypeError): - pass - else: - # Default: Standalone mode - manager = await self._ensure_manager() - result = await manager.get_definition( - file_path=symbol.file_path, - line=symbol.range.start_line, - character=symbol.range.start_character, - ) - - if result: - try: - location = Location.from_lsp_response(result) - except (KeyError, TypeError): - pass - - self._cache(cache_key, symbol.file_path, location) - return location - - async def get_call_hierarchy(self, symbol: CodeSymbolNode) -> List[CallHierarchyItem]: - """Get incoming/outgoing calls for a symbol. - - If call hierarchy is not supported by the language server, - falls back to using references. - - Args: - symbol: The code symbol to get call hierarchy for - - Returns: - List of CallHierarchyItem representing callers/callees. - Returns empty list on error or if not supported. 
- """ - cache_key = f"calls:{symbol.id}" - - if self._is_cached(cache_key, symbol.file_path): - return self.cache[cache_key].data - - items: List[CallHierarchyItem] = [] - - if self.use_vscode_bridge: - # Legacy: VSCode Bridge HTTP mode - result = await self._request_vscode_bridge("get_call_hierarchy", { - "file_path": symbol.file_path, - "line": symbol.range.start_line, - "character": symbol.range.start_character, - }) - - if result is None: - # Fallback: use references - refs = await self.get_references(symbol) - for ref in refs: - items.append(CallHierarchyItem( - name=f"caller@{ref.line}", - kind="reference", - file_path=ref.file_path, - range=Range( - start_line=ref.line, - start_character=ref.character, - end_line=ref.line, - end_character=ref.character, - ), - detail="Inferred from reference", - )) - elif isinstance(result, list): - for item in result: - try: - range_data = item.get("range", {}) - start = range_data.get("start", {}) - end = range_data.get("end", {}) - - items.append(CallHierarchyItem( - name=item.get("name", "unknown"), - kind=item.get("kind", "unknown"), - file_path=item.get("file_path", item.get("uri", "")), - range=Range( - start_line=start.get("line", 0) + 1, - start_character=start.get("character", 0) + 1, - end_line=end.get("line", 0) + 1, - end_character=end.get("character", 0) + 1, - ), - detail=item.get("detail"), - )) - except (KeyError, TypeError): - continue - else: - # Default: Standalone mode - manager = await self._ensure_manager() - - # Try to get call hierarchy items - hierarchy_items = await manager.get_call_hierarchy_items( - file_path=symbol.file_path, - line=symbol.range.start_line, - character=symbol.range.start_character, - ) - - if hierarchy_items: - # Get incoming calls for each item - for h_item in hierarchy_items: - incoming = await manager.get_incoming_calls(h_item) - for call in incoming: - from_item = call.get("from", {}) - range_data = from_item.get("range", {}) - start = range_data.get("start", {}) - end = 
range_data.get("end", {}) - - # Parse URI - uri = from_item.get("uri", "") - if uri.startswith("file:///"): - fp = uri[8:] if uri[8:9].isalpha() and uri[9:10] == ":" else uri[7:] - elif uri.startswith("file://"): - fp = uri[7:] - else: - fp = uri - - items.append(CallHierarchyItem( - name=from_item.get("name", "unknown"), - kind=str(from_item.get("kind", "unknown")), - file_path=fp, - range=Range( - start_line=start.get("line", 0) + 1, - start_character=start.get("character", 0) + 1, - end_line=end.get("line", 0) + 1, - end_character=end.get("character", 0) + 1, - ), - detail=from_item.get("detail"), - )) - else: - # Fallback: use references - refs = await self.get_references(symbol) - for ref in refs: - items.append(CallHierarchyItem( - name=f"caller@{ref.line}", - kind="reference", - file_path=ref.file_path, - range=Range( - start_line=ref.line, - start_character=ref.character, - end_line=ref.line, - end_character=ref.character, - ), - detail="Inferred from reference", - )) - - self._cache(cache_key, symbol.file_path, items) - return items - - async def get_document_symbols(self, file_path: str) -> List[Dict[str, Any]]: - """Get all symbols in a document (batch operation). - - This is more efficient than individual hover queries when processing - multiple locations in the same file. - - Args: - file_path: Path to the source file - - Returns: - List of symbol dictionaries with name, kind, range, etc. - Returns empty list on error or timeout. 
- """ - cache_key = f"symbols:{file_path}" - - if self._is_cached(cache_key, file_path): - return self.cache[cache_key].data - - symbols: List[Dict[str, Any]] = [] - - if self.use_vscode_bridge: - # Legacy: VSCode Bridge HTTP mode - result = await self._request_vscode_bridge("get_document_symbols", { - "file_path": file_path, - }) - - if isinstance(result, list): - symbols = self._flatten_document_symbols(result) - else: - # Default: Standalone mode - manager = await self._ensure_manager() - result = await manager.get_document_symbols(file_path) - - if result: - symbols = self._flatten_document_symbols(result) - - self._cache(cache_key, file_path, symbols) - return symbols - - def _flatten_document_symbols( - self, symbols: List[Dict[str, Any]], parent_name: str = "" - ) -> List[Dict[str, Any]]: - """Flatten nested document symbols into a flat list. - - Document symbols can be nested (e.g., methods inside classes). - This flattens them for easier lookup by line number. - - Args: - symbols: List of symbol dictionaries (may be nested) - parent_name: Name of parent symbol for qualification - - Returns: - Flat list of all symbols with their ranges - """ - flat: List[Dict[str, Any]] = [] - - for sym in symbols: - # Add the symbol itself - symbol_entry = { - "name": sym.get("name", "unknown"), - "kind": self._symbol_kind_to_string(sym.get("kind", 0)), - "range": sym.get("range", sym.get("location", {}).get("range", {})), - "selection_range": sym.get("selectionRange", {}), - "detail": sym.get("detail", ""), - "parent": parent_name, - } - flat.append(symbol_entry) - - # Recursively process children - children = sym.get("children", []) - if children: - qualified_name = sym.get("name", "") - if parent_name: - qualified_name = f"{parent_name}.{qualified_name}" - flat.extend(self._flatten_document_symbols(children, qualified_name)) - - return flat - - def _symbol_kind_to_string(self, kind: int) -> str: - """Convert LSP SymbolKind integer to string. 
- - Args: - kind: LSP SymbolKind enum value - - Returns: - Human-readable string representation - """ - # LSP SymbolKind enum (1-indexed) - kinds = { - 1: "file", - 2: "module", - 3: "namespace", - 4: "package", - 5: "class", - 6: "method", - 7: "property", - 8: "field", - 9: "constructor", - 10: "enum", - 11: "interface", - 12: "function", - 13: "variable", - 14: "constant", - 15: "string", - 16: "number", - 17: "boolean", - 18: "array", - 19: "object", - 20: "key", - 21: "null", - 22: "enum_member", - 23: "struct", - 24: "event", - 25: "operator", - 26: "type_parameter", - } - return kinds.get(kind, "unknown") - - async def get_hover(self, symbol: CodeSymbolNode) -> Optional[str]: - """Get hover documentation for a symbol. - - Args: - symbol: The code symbol to get hover info for - - Returns: - Hover documentation as string, or None if not available - """ - cache_key = f"hover:{symbol.id}" - - if self._is_cached(cache_key, symbol.file_path): - return self.cache[cache_key].data - - hover_text: Optional[str] = None - - if self.use_vscode_bridge: - # Legacy: VSCode Bridge HTTP mode - result = await self._request_vscode_bridge("get_hover", { - "file_path": symbol.file_path, - "line": symbol.range.start_line, - "character": symbol.range.start_character, - }) - - if result: - hover_text = self._parse_hover_result(result) - else: - # Default: Standalone mode - manager = await self._ensure_manager() - hover_text = await manager.get_hover( - file_path=symbol.file_path, - line=symbol.range.start_line, - character=symbol.range.start_character, - ) - - self._cache(cache_key, symbol.file_path, hover_text) - return hover_text - - def _parse_hover_result(self, result: Any) -> Optional[str]: - """Parse hover result into string.""" - if isinstance(result, str): - return result - elif isinstance(result, list): - parts = [] - for item in result: - if isinstance(item, str): - parts.append(item) - elif isinstance(item, dict): - value = item.get("value", item.get("contents", "")) - if 
value: - parts.append(str(value)) - return "\n\n".join(parts) if parts else None - elif isinstance(result, dict): - contents = result.get("contents", result.get("value", "")) - if isinstance(contents, str): - return contents - elif isinstance(contents, list): - parts = [] - for c in contents: - if isinstance(c, str): - parts.append(c) - elif isinstance(c, dict): - parts.append(str(c.get("value", ""))) - return "\n\n".join(parts) if parts else None - return None - - async def __aenter__(self) -> "LspBridge": - """Async context manager entry.""" - return self - - async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: - """Async context manager exit - close connections.""" - await self.close() - - -# Simple test -if __name__ == "__main__": - import sys - - async def test_lsp_bridge(): - """Simple test of LspBridge functionality.""" - print("Testing LspBridge (Standalone Mode)...") - print(f"Timeout: {LspBridge.DEFAULT_TIMEOUT}s") - print(f"Cache TTL: {LspBridge.DEFAULT_CACHE_TTL}s") - print() - - # Create a test symbol pointing to this file - test_file = os.path.abspath(__file__) - test_symbol = CodeSymbolNode( - id=f"{test_file}:LspBridge:96", - name="LspBridge", - kind="class", - file_path=test_file, - range=Range( - start_line=96, - start_character=1, - end_line=200, - end_character=1, - ), - ) - - print(f"Test symbol: {test_symbol.name} in {os.path.basename(test_symbol.file_path)}") - print() - - # Use standalone mode (default) - async with LspBridge( - workspace_root=str(Path(__file__).parent.parent.parent.parent), - ) as bridge: - print("1. Testing get_document_symbols...") - try: - symbols = await bridge.get_document_symbols(test_file) - print(f" Found {len(symbols)} symbols") - for sym in symbols[:5]: - print(f" - {sym.get('name')} ({sym.get('kind')})") - except Exception as e: - print(f" Error: {e}") - - print() - print("2. 
Testing get_definition...") - try: - definition = await bridge.get_definition(test_symbol) - if definition: - print(f" Definition: {os.path.basename(definition.file_path)}:{definition.line}") - else: - print(" No definition found") - except Exception as e: - print(f" Error: {e}") - - print() - print("3. Testing get_references...") - try: - refs = await bridge.get_references(test_symbol) - print(f" Found {len(refs)} references") - for ref in refs[:3]: - print(f" - {os.path.basename(ref.file_path)}:{ref.line}") - except Exception as e: - print(f" Error: {e}") - - print() - print("4. Testing get_hover...") - try: - hover = await bridge.get_hover(test_symbol) - if hover: - print(f" Hover: {hover[:100]}...") - else: - print(" No hover info found") - except Exception as e: - print(f" Error: {e}") - - print() - print("5. Testing get_call_hierarchy...") - try: - calls = await bridge.get_call_hierarchy(test_symbol) - print(f" Found {len(calls)} call hierarchy items") - for call in calls[:3]: - print(f" - {call.name} in {os.path.basename(call.file_path)}") - except Exception as e: - print(f" Error: {e}") - - print() - print("6. 
Testing cache...") - print(f" Cache entries: {len(bridge.cache)}") - for key in list(bridge.cache.keys())[:5]: - print(f" - {key}") - - print() - print("Test complete!") - - # Run the test - # Note: On Windows, use default ProactorEventLoop (supports subprocess creation) - - asyncio.run(test_lsp_bridge()) diff --git a/codex-lens/build/lib/codexlens/lsp/lsp_graph_builder.py b/codex-lens/build/lib/codexlens/lsp/lsp_graph_builder.py deleted file mode 100644 index b5f42a75..00000000 --- a/codex-lens/build/lib/codexlens/lsp/lsp_graph_builder.py +++ /dev/null @@ -1,375 +0,0 @@ -"""Graph builder for code association graphs via LSP.""" - -from __future__ import annotations - -import asyncio -import logging -from typing import Any, Dict, List, Optional, Set, Tuple - -from codexlens.hybrid_search.data_structures import ( - CallHierarchyItem, - CodeAssociationGraph, - CodeSymbolNode, - Range, -) -from codexlens.lsp.lsp_bridge import ( - Location, - LspBridge, -) - -logger = logging.getLogger(__name__) - - -class LspGraphBuilder: - """Builds code association graph by expanding from seed symbols using LSP.""" - - def __init__( - self, - max_depth: int = 2, - max_nodes: int = 100, - max_concurrent: int = 10, - ): - """Initialize GraphBuilder. - - Args: - max_depth: Maximum depth for BFS expansion from seeds. - max_nodes: Maximum number of nodes in the graph. - max_concurrent: Maximum concurrent LSP requests. - """ - self.max_depth = max_depth - self.max_nodes = max_nodes - self.max_concurrent = max_concurrent - # Cache for document symbols per file (avoids per-location hover queries) - self._document_symbols_cache: Dict[str, List[Dict[str, Any]]] = {} - - async def build_from_seeds( - self, - seeds: List[CodeSymbolNode], - lsp_bridge: LspBridge, - ) -> CodeAssociationGraph: - """Build association graph by BFS expansion from seeds. - - For each seed: - 1. Get references via LSP - 2. Get call hierarchy via LSP - 3. Add nodes and edges to graph - 4. 
Continue expanding until max_depth or max_nodes reached - - Args: - seeds: Initial seed symbols to expand from. - lsp_bridge: LSP bridge for querying language servers. - - Returns: - CodeAssociationGraph with expanded nodes and relationships. - """ - graph = CodeAssociationGraph() - visited: Set[str] = set() - semaphore = asyncio.Semaphore(self.max_concurrent) - - # Initialize queue with seeds at depth 0 - queue: List[Tuple[CodeSymbolNode, int]] = [(s, 0) for s in seeds] - - # Add seed nodes to graph - for seed in seeds: - graph.add_node(seed) - - # BFS expansion - while queue and len(graph.nodes) < self.max_nodes: - # Take a batch of nodes from queue - batch_size = min(self.max_concurrent, len(queue)) - batch = queue[:batch_size] - queue = queue[batch_size:] - - # Expand nodes in parallel - tasks = [ - self._expand_node( - node, depth, graph, lsp_bridge, visited, semaphore - ) - for node, depth in batch - ] - - results = await asyncio.gather(*tasks, return_exceptions=True) - - # Process results and add new nodes to queue - for result in results: - if isinstance(result, Exception): - logger.warning("Error expanding node: %s", result) - continue - if result: - # Add new nodes to queue if not at max depth - for new_node, new_depth in result: - if ( - new_depth <= self.max_depth - and len(graph.nodes) < self.max_nodes - ): - queue.append((new_node, new_depth)) - - return graph - - async def _expand_node( - self, - node: CodeSymbolNode, - depth: int, - graph: CodeAssociationGraph, - lsp_bridge: LspBridge, - visited: Set[str], - semaphore: asyncio.Semaphore, - ) -> List[Tuple[CodeSymbolNode, int]]: - """Expand a single node, return new nodes to process. - - Args: - node: Node to expand. - depth: Current depth in BFS. - graph: Graph to add nodes and edges to. - lsp_bridge: LSP bridge for queries. - visited: Set of visited node IDs. - semaphore: Semaphore for concurrency control. - - Returns: - List of (new_node, new_depth) tuples to add to queue. 
- """ - # Skip if already visited or at max depth - if node.id in visited: - return [] - if depth > self.max_depth: - return [] - if len(graph.nodes) >= self.max_nodes: - return [] - - visited.add(node.id) - new_nodes: List[Tuple[CodeSymbolNode, int]] = [] - - async with semaphore: - # Get relationships in parallel - try: - refs_task = lsp_bridge.get_references(node) - calls_task = lsp_bridge.get_call_hierarchy(node) - - refs, calls = await asyncio.gather( - refs_task, calls_task, return_exceptions=True - ) - - # Handle reference results - if isinstance(refs, Exception): - logger.debug( - "Failed to get references for %s: %s", node.id, refs - ) - refs = [] - - # Handle call hierarchy results - if isinstance(calls, Exception): - logger.debug( - "Failed to get call hierarchy for %s: %s", - node.id, - calls, - ) - calls = [] - - # Process references - for ref in refs: - if len(graph.nodes) >= self.max_nodes: - break - - ref_node = await self._location_to_node(ref, lsp_bridge) - if ref_node and ref_node.id != node.id: - if ref_node.id not in graph.nodes: - graph.add_node(ref_node) - new_nodes.append((ref_node, depth + 1)) - # Use add_edge since both nodes should exist now - graph.add_edge(node.id, ref_node.id, "references") - - # Process call hierarchy (incoming calls) - for call in calls: - if len(graph.nodes) >= self.max_nodes: - break - - call_node = await self._call_hierarchy_to_node( - call, lsp_bridge - ) - if call_node and call_node.id != node.id: - if call_node.id not in graph.nodes: - graph.add_node(call_node) - new_nodes.append((call_node, depth + 1)) - # Incoming call: call_node calls node - graph.add_edge(call_node.id, node.id, "calls") - - except Exception as e: - logger.warning( - "Error during node expansion for %s: %s", node.id, e - ) - - return new_nodes - - def clear_cache(self) -> None: - """Clear the document symbols cache. - - Call this between searches to free memory and ensure fresh data. 
- """ - self._document_symbols_cache.clear() - - async def _get_symbol_at_location( - self, - file_path: str, - line: int, - lsp_bridge: LspBridge, - ) -> Optional[Dict[str, Any]]: - """Find symbol at location using cached document symbols. - - This is much more efficient than individual hover queries because - document symbols are fetched once per file and cached. - - Args: - file_path: Path to the source file. - line: Line number (1-based). - lsp_bridge: LSP bridge for fetching document symbols. - - Returns: - Symbol dictionary with name, kind, range, etc., or None if not found. - """ - # Get or fetch document symbols for this file - if file_path not in self._document_symbols_cache: - symbols = await lsp_bridge.get_document_symbols(file_path) - self._document_symbols_cache[file_path] = symbols - - symbols = self._document_symbols_cache[file_path] - - # Find symbol containing this line (best match = smallest range) - best_match: Optional[Dict[str, Any]] = None - best_range_size = float("inf") - - for symbol in symbols: - sym_range = symbol.get("range", {}) - start = sym_range.get("start", {}) - end = sym_range.get("end", {}) - - # LSP ranges are 0-based, our line is 1-based - start_line = start.get("line", 0) + 1 - end_line = end.get("line", 0) + 1 - - if start_line <= line <= end_line: - range_size = end_line - start_line - if range_size < best_range_size: - best_match = symbol - best_range_size = range_size - - return best_match - - async def _location_to_node( - self, - location: Location, - lsp_bridge: LspBridge, - ) -> Optional[CodeSymbolNode]: - """Convert LSP location to CodeSymbolNode. - - Uses cached document symbols instead of individual hover queries - for better performance. - - Args: - location: LSP location to convert. - lsp_bridge: LSP bridge for additional queries. - - Returns: - CodeSymbolNode or None if conversion fails. 
- """ - try: - file_path = location.file_path - start_line = location.line - - # Try to find symbol info from cached document symbols (fast) - symbol_info = await self._get_symbol_at_location( - file_path, start_line, lsp_bridge - ) - - if symbol_info: - name = symbol_info.get("name", f"symbol_L{start_line}") - kind = symbol_info.get("kind", "unknown") - - # Extract range from symbol if available - sym_range = symbol_info.get("range", {}) - start = sym_range.get("start", {}) - end = sym_range.get("end", {}) - - location_range = Range( - start_line=start.get("line", start_line - 1) + 1, - start_character=start.get("character", location.character - 1) + 1, - end_line=end.get("line", start_line - 1) + 1, - end_character=end.get("character", location.character - 1) + 1, - ) - else: - # Fallback to basic node without symbol info - name = f"symbol_L{start_line}" - kind = "unknown" - location_range = Range( - start_line=location.line, - start_character=location.character, - end_line=location.line, - end_character=location.character, - ) - - node_id = self._create_node_id(file_path, name, start_line) - - return CodeSymbolNode( - id=node_id, - name=name, - kind=kind, - file_path=file_path, - range=location_range, - docstring="", # Skip hover for performance - ) - - except Exception as e: - logger.debug("Failed to convert location to node: %s", e) - return None - - async def _call_hierarchy_to_node( - self, - call_item: CallHierarchyItem, - lsp_bridge: LspBridge, - ) -> Optional[CodeSymbolNode]: - """Convert CallHierarchyItem to CodeSymbolNode. - - Args: - call_item: Call hierarchy item to convert. - lsp_bridge: LSP bridge (unused, kept for API consistency). - - Returns: - CodeSymbolNode or None if conversion fails. 
- """ - try: - file_path = call_item.file_path - name = call_item.name - start_line = call_item.range.start_line - # CallHierarchyItem.kind is already a string - kind = call_item.kind - - node_id = self._create_node_id(file_path, name, start_line) - - return CodeSymbolNode( - id=node_id, - name=name, - kind=kind, - file_path=file_path, - range=call_item.range, - docstring=call_item.detail or "", - ) - - except Exception as e: - logger.debug( - "Failed to convert call hierarchy item to node: %s", e - ) - return None - - def _create_node_id( - self, file_path: str, name: str, line: int - ) -> str: - """Create unique node ID. - - Args: - file_path: Path to the file. - name: Symbol name. - line: Line number (0-based). - - Returns: - Unique node ID string. - """ - return f"{file_path}:{name}:{line}" diff --git a/codex-lens/build/lib/codexlens/lsp/providers.py b/codex-lens/build/lib/codexlens/lsp/providers.py deleted file mode 100644 index d0275437..00000000 --- a/codex-lens/build/lib/codexlens/lsp/providers.py +++ /dev/null @@ -1,177 +0,0 @@ -"""LSP feature providers.""" - -from __future__ import annotations - -import logging -from dataclasses import dataclass -from pathlib import Path -from typing import Optional, TYPE_CHECKING - -if TYPE_CHECKING: - from codexlens.storage.global_index import GlobalSymbolIndex - from codexlens.storage.registry import RegistryStore - -logger = logging.getLogger(__name__) - - -@dataclass -class HoverInfo: - """Hover information for a symbol.""" - - name: str - kind: str - signature: str - documentation: Optional[str] - file_path: str - line_range: tuple # (start_line, end_line) - - -class HoverProvider: - """Provides hover information for symbols.""" - - def __init__( - self, - global_index: "GlobalSymbolIndex", - registry: Optional["RegistryStore"] = None, - ) -> None: - """Initialize hover provider. 
- - Args: - global_index: Global symbol index for lookups - registry: Optional registry store for index path resolution - """ - self.global_index = global_index - self.registry = registry - - def get_hover_info(self, symbol_name: str) -> Optional[HoverInfo]: - """Get hover information for a symbol. - - Args: - symbol_name: Name of the symbol to look up - - Returns: - HoverInfo or None if symbol not found - """ - # Look up symbol in global index using exact match - symbols = self.global_index.search( - name=symbol_name, - limit=1, - prefix_mode=False, - ) - - # Filter for exact name match - exact_matches = [s for s in symbols if s.name == symbol_name] - - if not exact_matches: - return None - - symbol = exact_matches[0] - - # Extract signature from source file - signature = self._extract_signature(symbol) - - # Symbol uses 'file' attribute and 'range' tuple - file_path = symbol.file or "" - start_line, end_line = symbol.range - - return HoverInfo( - name=symbol.name, - kind=symbol.kind, - signature=signature, - documentation=None, # Symbol doesn't have docstring field - file_path=file_path, - line_range=(start_line, end_line), - ) - - def _extract_signature(self, symbol) -> str: - """Extract function/class signature from source file. 
- - Args: - symbol: Symbol object with file and range information - - Returns: - Extracted signature string or fallback kind + name - """ - try: - file_path = Path(symbol.file) if symbol.file else None - if not file_path or not file_path.exists(): - return f"{symbol.kind} {symbol.name}" - - content = file_path.read_text(encoding="utf-8", errors="ignore") - lines = content.split("\n") - - # Extract signature lines (first line of definition + continuation) - start_line = symbol.range[0] - 1 # Convert 1-based to 0-based - if start_line >= len(lines) or start_line < 0: - return f"{symbol.kind} {symbol.name}" - - signature_lines = [] - first_line = lines[start_line] - signature_lines.append(first_line) - - # Continue if multiline signature (no closing paren + colon yet) - # Look for patterns like "def func(", "class Foo(", etc. - i = start_line + 1 - max_lines = min(start_line + 5, len(lines)) - while i < max_lines: - line = signature_lines[-1] - # Stop if we see closing pattern - if "):" in line or line.rstrip().endswith(":"): - break - signature_lines.append(lines[i]) - i += 1 - - return "\n".join(signature_lines) - - except Exception as e: - logger.debug(f"Failed to extract signature for {symbol.name}: {e}") - return f"{symbol.kind} {symbol.name}" - - def format_hover_markdown(self, info: HoverInfo) -> str: - """Format hover info as Markdown. 
- - Args: - info: HoverInfo object to format - - Returns: - Markdown-formatted hover content - """ - parts = [] - - # Detect language for code fence based on file extension - ext = Path(info.file_path).suffix.lower() if info.file_path else "" - lang_map = { - ".py": "python", - ".js": "javascript", - ".ts": "typescript", - ".tsx": "typescript", - ".jsx": "javascript", - ".java": "java", - ".go": "go", - ".rs": "rust", - ".c": "c", - ".cpp": "cpp", - ".h": "c", - ".hpp": "cpp", - ".cs": "csharp", - ".rb": "ruby", - ".php": "php", - } - lang = lang_map.get(ext, "") - - # Code block with signature - parts.append(f"```{lang}\n{info.signature}\n```") - - # Documentation if available - if info.documentation: - parts.append(f"\n---\n\n{info.documentation}") - - # Location info - file_name = Path(info.file_path).name if info.file_path else "unknown" - parts.append( - f"\n---\n\n*{info.kind}* defined in " - f"`{file_name}` " - f"(line {info.line_range[0]})" - ) - - return "\n".join(parts) diff --git a/codex-lens/build/lib/codexlens/lsp/server.py b/codex-lens/build/lib/codexlens/lsp/server.py deleted file mode 100644 index 809bba9e..00000000 --- a/codex-lens/build/lib/codexlens/lsp/server.py +++ /dev/null @@ -1,263 +0,0 @@ -"""codex-lens LSP Server implementation using pygls. - -This module provides the main Language Server class and entry point. -""" - -from __future__ import annotations - -import argparse -import logging -import sys -from pathlib import Path -from typing import Optional - -try: - from lsprotocol import types as lsp - from pygls.lsp.server import LanguageServer -except ImportError as exc: - raise ImportError( - "LSP dependencies not installed. 
Install with: pip install codex-lens[lsp]" - ) from exc - -from codexlens.config import Config -from codexlens.search.chain_search import ChainSearchEngine -from codexlens.storage.global_index import GlobalSymbolIndex -from codexlens.storage.path_mapper import PathMapper -from codexlens.storage.registry import RegistryStore - -logger = logging.getLogger(__name__) - - -class CodexLensLanguageServer(LanguageServer): - """Language Server for codex-lens code indexing. - - Provides IDE features using codex-lens symbol index: - - Go to Definition - - Find References - - Code Completion - - Hover Information - - Workspace Symbol Search - - Attributes: - registry: Global project registry for path lookups - mapper: Path mapper for source/index conversions - global_index: Project-wide symbol index - search_engine: Chain search engine for symbol search - workspace_root: Current workspace root path - """ - - def __init__(self) -> None: - super().__init__(name="codexlens-lsp", version="0.1.0") - - self.registry: Optional[RegistryStore] = None - self.mapper: Optional[PathMapper] = None - self.global_index: Optional[GlobalSymbolIndex] = None - self.search_engine: Optional[ChainSearchEngine] = None - self.workspace_root: Optional[Path] = None - self._config: Optional[Config] = None - - def initialize_components(self, workspace_root: Path) -> bool: - """Initialize codex-lens components for the workspace. 
- - Args: - workspace_root: Root path of the workspace - - Returns: - True if initialization succeeded, False otherwise - """ - self.workspace_root = workspace_root.resolve() - logger.info("Initializing codex-lens for workspace: %s", self.workspace_root) - - try: - # Initialize registry - self.registry = RegistryStore() - self.registry.initialize() - - # Initialize path mapper - self.mapper = PathMapper() - - # Try to find project in registry - project_info = self.registry.find_by_source_path(str(self.workspace_root)) - - if project_info: - project_id = int(project_info["id"]) - index_root = Path(project_info["index_root"]) - - # Initialize global symbol index - global_db = index_root / GlobalSymbolIndex.DEFAULT_DB_NAME - self.global_index = GlobalSymbolIndex(global_db, project_id) - self.global_index.initialize() - - # Initialize search engine - self._config = Config() - self.search_engine = ChainSearchEngine( - registry=self.registry, - mapper=self.mapper, - config=self._config, - ) - - logger.info("codex-lens initialized for project: %s", project_info["source_root"]) - return True - else: - logger.warning( - "Workspace not indexed by codex-lens: %s. 
" - "Run 'codexlens index %s' to index first.", - self.workspace_root, - self.workspace_root, - ) - return False - - except Exception as exc: - logger.error("Failed to initialize codex-lens: %s", exc) - return False - - def shutdown_components(self) -> None: - """Clean up codex-lens components.""" - if self.global_index: - try: - self.global_index.close() - except Exception as exc: - logger.debug("Error closing global index: %s", exc) - self.global_index = None - - if self.search_engine: - try: - self.search_engine.close() - except Exception as exc: - logger.debug("Error closing search engine: %s", exc) - self.search_engine = None - - if self.registry: - try: - self.registry.close() - except Exception as exc: - logger.debug("Error closing registry: %s", exc) - self.registry = None - - -# Create server instance -server = CodexLensLanguageServer() - - -@server.feature(lsp.INITIALIZE) -def lsp_initialize(params: lsp.InitializeParams) -> lsp.InitializeResult: - """Handle LSP initialize request.""" - logger.info("LSP initialize request received") - - # Get workspace root - workspace_root: Optional[Path] = None - if params.root_uri: - workspace_root = Path(params.root_uri.replace("file://", "").replace("file:", "")) - elif params.root_path: - workspace_root = Path(params.root_path) - - if workspace_root: - server.initialize_components(workspace_root) - - # Declare server capabilities - return lsp.InitializeResult( - capabilities=lsp.ServerCapabilities( - text_document_sync=lsp.TextDocumentSyncOptions( - open_close=True, - change=lsp.TextDocumentSyncKind.Incremental, - save=lsp.SaveOptions(include_text=False), - ), - definition_provider=True, - references_provider=True, - completion_provider=lsp.CompletionOptions( - trigger_characters=[".", ":"], - resolve_provider=False, - ), - hover_provider=True, - workspace_symbol_provider=True, - ), - server_info=lsp.ServerInfo( - name="codexlens-lsp", - version="0.1.0", - ), - ) - - -@server.feature(lsp.SHUTDOWN) -def 
lsp_shutdown(params: None) -> None: - """Handle LSP shutdown request.""" - logger.info("LSP shutdown request received") - server.shutdown_components() - - -def main() -> int: - """Entry point for codexlens-lsp command. - - Returns: - Exit code (0 for success) - """ - # Import handlers to register them with the server - # This must be done before starting the server - import codexlens.lsp.handlers # noqa: F401 - - parser = argparse.ArgumentParser( - description="codex-lens Language Server", - prog="codexlens-lsp", - ) - parser.add_argument( - "--stdio", - action="store_true", - default=True, - help="Use stdio for communication (default)", - ) - parser.add_argument( - "--tcp", - action="store_true", - help="Use TCP for communication", - ) - parser.add_argument( - "--host", - default="127.0.0.1", - help="TCP host (default: 127.0.0.1)", - ) - parser.add_argument( - "--port", - type=int, - default=2087, - help="TCP port (default: 2087)", - ) - parser.add_argument( - "--log-level", - choices=["DEBUG", "INFO", "WARNING", "ERROR"], - default="INFO", - help="Log level (default: INFO)", - ) - parser.add_argument( - "--log-file", - help="Log file path (optional)", - ) - - args = parser.parse_args() - - # Configure logging - log_handlers = [] - if args.log_file: - log_handlers.append(logging.FileHandler(args.log_file)) - else: - log_handlers.append(logging.StreamHandler(sys.stderr)) - - logging.basicConfig( - level=getattr(logging, args.log_level), - format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", - handlers=log_handlers, - ) - - logger.info("Starting codexlens-lsp server") - - if args.tcp: - logger.info("Starting TCP server on %s:%d", args.host, args.port) - server.start_tcp(args.host, args.port) - else: - logger.info("Starting stdio server") - server.start_io() - - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/codex-lens/build/lib/codexlens/lsp/standalone_manager.py b/codex-lens/build/lib/codexlens/lsp/standalone_manager.py deleted 
file mode 100644 index aa6edf6b..00000000 --- a/codex-lens/build/lib/codexlens/lsp/standalone_manager.py +++ /dev/null @@ -1,1159 +0,0 @@ -"""Standalone Language Server Manager for direct LSP communication. - -This module provides direct communication with language servers via JSON-RPC over stdio, -eliminating the need for VSCode Bridge. Similar to cclsp architecture. - -Features: -- Direct subprocess spawning of language servers -- JSON-RPC 2.0 communication over stdin/stdout -- Multi-language support via configuration file (lsp-servers.json) -- Process lifecycle management with auto-restart -- Compatible interface with existing LspBridge -""" - -from __future__ import annotations - -import asyncio -import json -import logging -import os -import sys -from dataclasses import dataclass, field -from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple - -logger = logging.getLogger(__name__) - - -@dataclass -class ServerConfig: - """Configuration for a language server.""" - - language_id: str - display_name: str - extensions: List[str] - command: List[str] - enabled: bool = True - initialization_options: Dict[str, Any] = field(default_factory=dict) - settings: Dict[str, Any] = field(default_factory=dict) - root_dir: str = "." 
- timeout: int = 30000 # ms - restart_interval: int = 5000 # ms - max_restarts: int = 3 - - -@dataclass -class ServerState: - """State of a running language server.""" - - config: ServerConfig - process: asyncio.subprocess.Process - reader: asyncio.StreamReader - writer: asyncio.StreamWriter - request_id: int = 0 - initialized: bool = False - capabilities: Dict[str, Any] = field(default_factory=dict) - pending_requests: Dict[int, asyncio.Future] = field(default_factory=dict) - restart_count: int = 0 - # Queue for producer-consumer pattern - continuous reading puts messages here - message_queue: asyncio.Queue = field(default_factory=asyncio.Queue) - - -class StandaloneLspManager: - """Manager for direct language server communication. - - Spawns language servers as subprocesses and communicates via JSON-RPC - over stdin/stdout. No VSCode or GUI dependency required. - - Example: - manager = StandaloneLspManager(workspace_root="/path/to/project") - await manager.start() - - definition = await manager.get_definition( - file_path="src/main.py", - line=10, - character=5 - ) - - await manager.stop() - """ - - DEFAULT_CONFIG_FILE = "lsp-servers.json" - - def __init__( - self, - workspace_root: Optional[str] = None, - config_file: Optional[str] = None, - timeout: float = 30.0, - ): - """Initialize StandaloneLspManager. 
- - Args: - workspace_root: Root directory of the workspace (used for rootUri) - config_file: Path to lsp-servers.json configuration file - timeout: Default timeout for LSP requests in seconds - """ - self.workspace_root = Path(workspace_root or os.getcwd()).resolve() - self.config_file = config_file - self.timeout = timeout - - self._servers: Dict[str, ServerState] = {} # language_id -> ServerState - self._extension_map: Dict[str, str] = {} # extension -> language_id - self._configs: Dict[str, ServerConfig] = {} # language_id -> ServerConfig - self._read_tasks: Dict[str, asyncio.Task] = {} # language_id -> read task - self._stderr_tasks: Dict[str, asyncio.Task] = {} # language_id -> stderr read task - self._lock = asyncio.Lock() - - def _find_config_file(self) -> Optional[Path]: - """Find the lsp-servers.json configuration file. - - Search order: - 1. Explicit config_file parameter - 2. {workspace_root}/lsp-servers.json - 3. {workspace_root}/.codexlens/lsp-servers.json - 4. Package default (codexlens/lsp-servers.json) - """ - search_paths = [] - - if self.config_file: - search_paths.append(Path(self.config_file)) - - search_paths.extend([ - self.workspace_root / self.DEFAULT_CONFIG_FILE, - self.workspace_root / ".codexlens" / self.DEFAULT_CONFIG_FILE, - Path(__file__).parent.parent.parent.parent / self.DEFAULT_CONFIG_FILE, # package root - ]) - - for path in search_paths: - if path.exists(): - return path - - return None - - def _load_config(self) -> None: - """Load language server configuration from JSON file.""" - config_path = self._find_config_file() - - if not config_path: - logger.warning(f"No {self.DEFAULT_CONFIG_FILE} found, using empty config") - return - - try: - with open(config_path, "r", encoding="utf-8") as f: - data = json.load(f) - except Exception as e: - logger.error(f"Failed to load config from {config_path}: {e}") - return - - # Parse defaults - defaults = data.get("defaults", {}) - default_timeout = defaults.get("timeout", 30000) - 
default_restart_interval = defaults.get("restartInterval", 5000) - default_max_restarts = defaults.get("maxRestarts", 3) - - # Parse servers - for server_data in data.get("servers", []): - if not server_data.get("enabled", True): - continue - - language_id = server_data.get("languageId", "") - if not language_id: - continue - - config = ServerConfig( - language_id=language_id, - display_name=server_data.get("displayName", language_id), - extensions=server_data.get("extensions", []), - command=server_data.get("command", []), - enabled=server_data.get("enabled", True), - initialization_options=server_data.get("initializationOptions", {}), - settings=server_data.get("settings", {}), - root_dir=server_data.get("rootDir", defaults.get("rootDir", ".")), - timeout=server_data.get("timeout", default_timeout), - restart_interval=server_data.get("restartInterval", default_restart_interval), - max_restarts=server_data.get("maxRestarts", default_max_restarts), - ) - - self._configs[language_id] = config - - # Build extension map - for ext in config.extensions: - self._extension_map[ext.lower()] = language_id - - logger.info(f"Loaded {len(self._configs)} language server configs from {config_path}") - - def get_language_id(self, file_path: str) -> Optional[str]: - """Get language ID for a file based on extension. - - Args: - file_path: Path to the file - - Returns: - Language ID (e.g., "python", "typescript") or None if unknown - """ - ext = Path(file_path).suffix.lstrip(".").lower() - return self._extension_map.get(ext) - - async def start(self) -> None: - """Initialize the manager and load configuration. - - This does NOT start any language servers yet - they are started - on-demand when first needed for a file type. 
- """ - self._load_config() - logger.info(f"StandaloneLspManager started for workspace: {self.workspace_root}") - - async def stop(self) -> None: - """Stop all running language servers and cleanup.""" - async with self._lock: - for language_id in list(self._servers.keys()): - await self._stop_server(language_id) - - logger.info("StandaloneLspManager stopped") - - async def _start_server(self, language_id: str) -> Optional[ServerState]: - """Start a language server for the given language. - - Args: - language_id: The language ID (e.g., "python") - - Returns: - ServerState if successful, None on failure - """ - config = self._configs.get(language_id) - if not config: - logger.error(f"No configuration for language: {language_id}") - return None - - if not config.command: - logger.error(f"No command specified for {language_id}") - return None - - try: - logger.info(f"Starting {config.display_name}: {' '.join(config.command)}") - - # Spawn the language server process - process = await asyncio.create_subprocess_exec( - *config.command, - stdin=asyncio.subprocess.PIPE, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - cwd=str(self.workspace_root), - ) - - if process.stdin is None or process.stdout is None: - logger.error(f"Failed to get stdin/stdout for {language_id}") - process.terminate() - return None - - state = ServerState( - config=config, - process=process, - reader=process.stdout, - writer=process.stdin, - ) - - self._servers[language_id] = state - - # Start reading stderr in background (prevents pipe buffer from filling up) - if process.stderr: - self._stderr_tasks[language_id] = asyncio.create_task( - self._read_stderr(language_id, process.stderr) - ) - - # CRITICAL: Start the continuous reader task IMMEDIATELY before any communication - # This ensures no messages are lost during initialization handshake - self._read_tasks[language_id] = asyncio.create_task( - self._continuous_reader(language_id) - ) - - # Start the message processor task to 
handle queued messages - asyncio.create_task(self._process_messages(language_id)) - - # Initialize the server - now uses queue for reading responses - await self._initialize_server(state) - - logger.info(f"{config.display_name} started and initialized") - return state - - except FileNotFoundError: - logger.error( - f"Language server not found: {config.command[0]}. " - f"Install it with the appropriate package manager." - ) - return None - except Exception as e: - logger.error(f"Failed to start {language_id}: {e}") - return None - - async def _stop_server(self, language_id: str) -> None: - """Stop a language server.""" - state = self._servers.pop(language_id, None) - if not state: - return - - # Cancel read task - task = self._read_tasks.pop(language_id, None) - if task: - task.cancel() - try: - await task - except asyncio.CancelledError: - pass - - # Cancel stderr task - stderr_task = self._stderr_tasks.pop(language_id, None) - if stderr_task: - stderr_task.cancel() - try: - await stderr_task - except asyncio.CancelledError: - pass - - # Send shutdown request - try: - await self._send_request(state, "shutdown", None, timeout=5.0) - except Exception: - pass - - # Send exit notification - try: - await self._send_notification(state, "exit", None) - except Exception: - pass - - # Terminate process - if state.process.returncode is None: - state.process.terminate() - try: - await asyncio.wait_for(state.process.wait(), timeout=5.0) - except asyncio.TimeoutError: - state.process.kill() - - logger.info(f"Stopped {state.config.display_name}") - - async def _get_server(self, file_path: str) -> Optional[ServerState]: - """Get or start the appropriate language server for a file. 
- - Args: - file_path: Path to the file being operated on - - Returns: - ServerState for the appropriate language server, or None - """ - language_id = self.get_language_id(file_path) - if not language_id: - logger.debug(f"No language server configured for: {file_path}") - return None - - async with self._lock: - if language_id in self._servers: - state = self._servers[language_id] - # Check if process is still running - if state.process.returncode is None: - return state - # Process died, remove it - del self._servers[language_id] - - # Start new server - return await self._start_server(language_id) - - async def _initialize_server(self, state: ServerState) -> None: - """Send initialize request and wait for response via the message queue. - - The continuous reader and message processor are already running, so we just - send the request and wait for the response via pending_requests. - """ - root_uri = self.workspace_root.as_uri() - - # Simplified params matching direct test that works - params = { - "processId": None, # Use None like direct test - "rootUri": root_uri, - "rootPath": str(self.workspace_root), - "capabilities": { - "textDocument": { - "documentSymbol": { - "hierarchicalDocumentSymbolSupport": True, - }, - }, - "workspace": { - "configuration": True, - }, - }, - "workspaceFolders": [ - { - "uri": root_uri, - "name": self.workspace_root.name, - } - ], - } - - # Send initialize request and wait for response via queue - state.request_id += 1 - init_request_id = state.request_id - - # Create future for the response - future: asyncio.Future = asyncio.get_event_loop().create_future() - state.pending_requests[init_request_id] = future - - # Send the request - init_message = { - "jsonrpc": "2.0", - "id": init_request_id, - "method": "initialize", - "params": params, - } - encoded = self._encode_message(init_message) - logger.debug(f"Sending initialize request id={init_request_id}") - state.writer.write(encoded) - await state.writer.drain() - - # Wait for 
response (will be routed by _process_messages) - try: - init_result = await asyncio.wait_for(future, timeout=30.0) - except asyncio.TimeoutError: - state.pending_requests.pop(init_request_id, None) - raise RuntimeError("Initialize request timed out") - - if init_result is None: - init_result = {} - - # Store capabilities - state.capabilities = init_result.get("capabilities", {}) - state.initialized = True - logger.debug(f"Initialize response received, capabilities: {len(state.capabilities)} keys") - - # Send initialized notification - await self._send_notification(state, "initialized", {}) - - # Give time for server to process initialized and send any requests - # The message processor will handle workspace/configuration automatically - await asyncio.sleep(0.5) - - def _encode_message(self, content: Dict[str, Any]) -> bytes: - """Encode a JSON-RPC message with LSP headers.""" - body = json.dumps(content).encode("utf-8") - header = f"Content-Length: {len(body)}\r\n\r\n" - return header.encode("ascii") + body - - async def _read_message(self, reader: asyncio.StreamReader) -> Tuple[Optional[Dict[str, Any]], bool]: - """Read a JSON-RPC message from the stream. - - Returns: - Tuple of (message, stream_closed). If stream_closed is True, the reader loop - should exit. If False and message is None, it was just a timeout. 
- """ - try: - # Read headers - content_length = 0 - while True: - try: - line = await asyncio.wait_for(reader.readline(), timeout=1.0) - except asyncio.TimeoutError: - # Timeout is not an error - just no message available yet - return None, False - - if not line: - # Empty read means stream closed - return None, True - - line_str = line.decode("ascii").strip() - if line_str: # Only log non-empty lines - logger.debug(f"Read header line: {repr(line_str[:80])}") - if not line_str: - break # Empty line = end of headers - - if line_str.lower().startswith("content-length:"): - content_length = int(line_str.split(":")[1].strip()) - - if content_length == 0: - return None, False - - # Read body - body = await reader.readexactly(content_length) - return json.loads(body.decode("utf-8")), False - - except asyncio.IncompleteReadError: - return None, True - except Exception as e: - logger.error(f"Error reading message: {e}") - return None, True - - async def _continuous_reader(self, language_id: str) -> None: - """Continuously read messages from language server and put them in the queue. - - This is the PRODUCER in the producer-consumer pattern. It starts IMMEDIATELY - after subprocess creation and runs continuously until shutdown. This ensures - no messages are ever lost, even during initialization handshake. 
- """ - state = self._servers.get(language_id) - if not state: - return - - logger.debug(f"Continuous reader started for {language_id}") - - try: - while True: - try: - # Read headers with timeout - content_length = 0 - while True: - try: - line = await asyncio.wait_for(state.reader.readline(), timeout=5.0) - except asyncio.TimeoutError: - continue # Keep waiting for data - - if not line: - logger.debug(f"Continuous reader for {language_id}: EOF") - return - - line_str = line.decode("ascii").strip() - if not line_str: - break # End of headers - - if line_str.lower().startswith("content-length:"): - content_length = int(line_str.split(":")[1].strip()) - - if content_length == 0: - continue - - # Read body - body = await state.reader.readexactly(content_length) - message = json.loads(body.decode("utf-8")) - - # Put message in queue for processing - await state.message_queue.put(message) - - msg_id = message.get("id", "none") - msg_method = message.get("method", "none") - logger.debug(f"Queued message: id={msg_id}, method={msg_method}") - - except asyncio.IncompleteReadError: - logger.debug(f"Continuous reader for {language_id}: IncompleteReadError") - return - except Exception as e: - logger.error(f"Error in continuous reader for {language_id}: {e}") - await asyncio.sleep(0.1) - - except asyncio.CancelledError: - logger.debug(f"Continuous reader cancelled for {language_id}") - except Exception as e: - logger.error(f"Fatal error in continuous reader for {language_id}: {e}") - - async def _process_messages(self, language_id: str) -> None: - """Process messages from the queue and route them appropriately. - - This is the CONSUMER in the producer-consumer pattern. It handles: - - Server requests (workspace/configuration, etc.) - responds immediately - - Notifications (window/logMessage, etc.) 
- logs them - - Responses to our requests are NOT handled here - they're consumed by _wait_for_response - """ - state = self._servers.get(language_id) - if not state: - return - - logger.debug(f"Message processor started for {language_id}") - - try: - while True: - # Get message from queue (blocks until available) - message = await state.message_queue.get() - - msg_id = message.get("id") - method = message.get("method", "") - - # Response (has id but no method) - put back for _wait_for_response to consume - if msg_id is not None and not method: - # This is a response to one of our requests - if msg_id in state.pending_requests: - future = state.pending_requests.pop(msg_id) - if "error" in message: - future.set_exception( - Exception(message["error"].get("message", "Unknown error")) - ) - else: - future.set_result(message.get("result")) - logger.debug(f"Response routed to pending request id={msg_id}") - else: - logger.debug(f"No pending request for response id={msg_id}") - - # Server request (has both id and method) - needs response - elif msg_id is not None and method: - logger.info(f"Server request: {method} (id={msg_id})") - await self._handle_server_request(state, message) - - # Notification (has method but no id) - elif method: - self._handle_server_message(language_id, message) - - state.message_queue.task_done() - - except asyncio.CancelledError: - logger.debug(f"Message processor cancelled for {language_id}") - except Exception as e: - logger.error(f"Error in message processor for {language_id}: {e}") - - async def _read_stderr(self, language_id: str, stderr: asyncio.StreamReader) -> None: - """Background task to read stderr from a language server. - - This prevents the stderr pipe buffer from filling up, which would - cause the language server process to block and stop responding. 
- """ - try: - while True: - line = await stderr.readline() - if not line: - break - text = line.decode("utf-8", errors="replace").rstrip() - if text: - # Log stderr output at warning level for visibility - logger.warning(f"[{language_id}] {text}") - except asyncio.CancelledError: - pass - except Exception as e: - logger.debug(f"Error reading stderr for {language_id}: {e}") - - def _handle_server_message(self, language_id: str, message: Dict[str, Any]) -> None: - """Handle notifications from the language server.""" - method = message.get("method", "") - params = message.get("params", {}) - - if method == "window/logMessage": - level = params.get("type", 4) # 1=error, 2=warn, 3=info, 4=log - text = params.get("message", "") - if level == 1: - logger.error(f"[{language_id}] {text}") - elif level == 2: - logger.warning(f"[{language_id}] {text}") - else: - logger.debug(f"[{language_id}] {text}") - - elif method == "window/showMessage": - text = params.get("message", "") - logger.info(f"[{language_id}] {text}") - - async def _handle_server_request(self, state: ServerState, message: Dict[str, Any]) -> None: - """Handle requests from the language server that need a response.""" - request_id = message["id"] - method = message.get("method", "") - params = message.get("params", {}) - - logger.info(f"SERVER REQUEST: {method} (id={request_id}) params={params}") - - result = None - - if method == "workspace/configuration": - # Return configuration items for each requested scope - items = params.get("items", []) - result = [] - for item in items: - section = item.get("section", "") - # Provide Python-specific settings for pyright - if section == "python": - result.append({ - "pythonPath": "python", - "analysis": { - "autoSearchPaths": True, - "useLibraryCodeForTypes": True, - "diagnosticMode": "workspace", - } - }) - elif section == "python.analysis": - result.append({ - "autoSearchPaths": True, - "useLibraryCodeForTypes": True, - "diagnosticMode": "workspace", - 
"typeCheckingMode": "basic", - }) - else: - # Return empty object for unknown sections - result.append({}) - sections = [item.get("section", "") for item in items] - logger.info(f"Responding to workspace/configuration with {len(result)} items for sections: {sections}") - - elif method == "client/registerCapability": - # Accept capability registration - result = None - - elif method == "window/workDoneProgress/create": - # Accept progress token creation - result = None - - else: - logger.debug(f"Unhandled server request: {method}") - - # Send response - response = { - "jsonrpc": "2.0", - "id": request_id, - "result": result, - } - try: - encoded = self._encode_message(response) - state.writer.write(encoded) - await state.writer.drain() - logger.debug(f"Sent response to server request {method} (id={request_id})") - except Exception as e: - logger.error(f"Failed to respond to server request {method}: {e}") - - async def _send_request( - self, - state: ServerState, - method: str, - params: Optional[Dict[str, Any]], - timeout: Optional[float] = None, - ) -> Any: - """Send a request to the language server and wait for response. 
- - Args: - state: Server state - method: LSP method name (e.g., "textDocument/definition") - params: Request parameters - timeout: Request timeout in seconds - - Returns: - Response result - """ - state.request_id += 1 - request_id = state.request_id - - message = { - "jsonrpc": "2.0", - "id": request_id, - "method": method, - "params": params or {}, - } - - future: asyncio.Future = asyncio.get_event_loop().create_future() - state.pending_requests[request_id] = future - - try: - encoded = self._encode_message(message) - logger.debug(f"Sending request id={request_id}, method={method}") - state.writer.write(encoded) - await state.writer.drain() - - return await asyncio.wait_for( - future, - timeout=timeout or self.timeout - ) - except asyncio.TimeoutError: - state.pending_requests.pop(request_id, None) - logger.warning(f"Request timed out: {method}") - return None - except Exception as e: - state.pending_requests.pop(request_id, None) - logger.error(f"Request failed: {method} - {e}") - return None - - async def _send_notification( - self, - state: ServerState, - method: str, - params: Optional[Dict[str, Any]], - ) -> None: - """Send a notification to the language server (no response expected).""" - message = { - "jsonrpc": "2.0", - "method": method, - "params": params or {}, - } - - try: - encoded = self._encode_message(message) - logger.debug(f"Sending notification: {method} ({len(encoded)} bytes)") - state.writer.write(encoded) - await state.writer.drain() - logger.debug(f"Notification sent: {method}") - except Exception as e: - logger.error(f"Failed to send notification {method}: {e}") - - def _to_text_document_identifier(self, file_path: str) -> Dict[str, str]: - """Create TextDocumentIdentifier from file path.""" - uri = Path(file_path).resolve().as_uri() - return {"uri": uri} - - def _to_position(self, line: int, character: int) -> Dict[str, int]: - """Create LSP Position (0-indexed) from 1-indexed line/character.""" - return { - "line": max(0, line - 1), # 
Convert 1-indexed to 0-indexed - "character": max(0, character - 1), - } - - async def _open_document(self, state: ServerState, file_path: str) -> None: - """Send textDocument/didOpen notification.""" - resolved_path = Path(file_path).resolve() - - try: - content = resolved_path.read_text(encoding="utf-8") - except Exception as e: - logger.error(f"Failed to read file {file_path}: {e}") - return - - # Detect language ID from extension - language_id = self.get_language_id(file_path) or "plaintext" - - logger.debug(f"Opening document: {resolved_path.name} ({len(content)} chars)") - await self._send_notification(state, "textDocument/didOpen", { - "textDocument": { - "uri": resolved_path.as_uri(), - "languageId": language_id, - "version": 1, - "text": content, - } - }) - - # Give the language server a brief moment to process the file - # The message queue handles any server requests automatically - await asyncio.sleep(0.5) - - # ========== Public LSP Methods ========== - - async def get_definition( - self, - file_path: str, - line: int, - character: int, - ) -> Optional[Dict[str, Any]]: - """Get definition location for symbol at position. 
- - Args: - file_path: Path to the source file - line: Line number (1-indexed) - character: Character position (1-indexed) - - Returns: - Location dict with uri, line, character, or None - """ - state = await self._get_server(file_path) - if not state: - return None - - # Open document first - await self._open_document(state, file_path) - - result = await self._send_request(state, "textDocument/definition", { - "textDocument": self._to_text_document_identifier(file_path), - "position": self._to_position(line, character), - }) - - if not result: - return None - - # Handle single location or array - if isinstance(result, list): - if len(result) == 0: - return None - result = result[0] - - # Handle LocationLink vs Location - if "targetUri" in result: - # LocationLink format - return { - "uri": result["targetUri"], - "range": result.get("targetRange", result.get("targetSelectionRange", {})), - } - else: - # Location format - return result - - async def get_references( - self, - file_path: str, - line: int, - character: int, - include_declaration: bool = True, - ) -> List[Dict[str, Any]]: - """Get all references to symbol at position. 
- - Args: - file_path: Path to the source file - line: Line number (1-indexed) - character: Character position (1-indexed) - include_declaration: Whether to include the declaration - - Returns: - List of Location dicts with uri and range - """ - state = await self._get_server(file_path) - if not state: - return [] - - # Open document first - await self._open_document(state, file_path) - - result = await self._send_request(state, "textDocument/references", { - "textDocument": self._to_text_document_identifier(file_path), - "position": self._to_position(line, character), - "context": { - "includeDeclaration": include_declaration, - }, - }) - - if not result or not isinstance(result, list): - return [] - - return result - - async def get_hover( - self, - file_path: str, - line: int, - character: int, - ) -> Optional[str]: - """Get hover documentation for symbol at position. - - Args: - file_path: Path to the source file - line: Line number (1-indexed) - character: Character position (1-indexed) - - Returns: - Hover content as string, or None - """ - state = await self._get_server(file_path) - if not state: - return None - - # Open document first - await self._open_document(state, file_path) - - result = await self._send_request(state, "textDocument/hover", { - "textDocument": self._to_text_document_identifier(file_path), - "position": self._to_position(line, character), - }) - - if not result: - return None - - contents = result.get("contents") - if not contents: - return None - - # Parse contents (can be string, MarkedString, MarkupContent, or array) - return self._parse_hover_contents(contents) - - def _parse_hover_contents(self, contents: Any) -> Optional[str]: - """Parse hover contents into string.""" - if isinstance(contents, str): - return contents - - if isinstance(contents, dict): - # MarkupContent or MarkedString - return contents.get("value", contents.get("contents", "")) - - if isinstance(contents, list): - parts = [] - for item in contents: - if 
isinstance(item, str): - parts.append(item) - elif isinstance(item, dict): - parts.append(item.get("value", "")) - return "\n\n".join(p for p in parts if p) - - return None - - async def get_document_symbols( - self, - file_path: str, - ) -> List[Dict[str, Any]]: - """Get all symbols in a document. - - Args: - file_path: Path to the source file - - Returns: - List of DocumentSymbol or SymbolInformation dicts - """ - state = await self._get_server(file_path) - if not state: - return [] - - # Open document first - await self._open_document(state, file_path) - - result = await self._send_request(state, "textDocument/documentSymbol", { - "textDocument": self._to_text_document_identifier(file_path), - }) - - if not result or not isinstance(result, list): - return [] - - return result - - async def get_call_hierarchy_items( - self, - file_path: str, - line: int, - character: int, - wait_for_analysis: float = 2.0, - ) -> List[Dict[str, Any]]: - """Prepare call hierarchy items for a position. - - Args: - file_path: Path to the source file - line: Line number (1-indexed) - character: Character position (1-indexed) - wait_for_analysis: Time to wait for server analysis (seconds) - - Returns: - List of CallHierarchyItem dicts - """ - state = await self._get_server(file_path) - if not state: - return [] - - # Check if call hierarchy is supported - if not state.capabilities.get("callHierarchyProvider"): - return [] - - # Open document first - await self._open_document(state, file_path) - - # Wait for language server to complete analysis - # This is critical for Pyright to return valid call hierarchy items - if wait_for_analysis > 0: - await asyncio.sleep(wait_for_analysis) - - result = await self._send_request( - state, - "textDocument/prepareCallHierarchy", - { - "textDocument": self._to_text_document_identifier(file_path), - "position": self._to_position(line, character), - }, - ) - - if not result or not isinstance(result, list): - return [] - - return result - - async def 
get_incoming_calls( - self, - item: Dict[str, Any], - ) -> List[Dict[str, Any]]: - """Get incoming calls for a call hierarchy item. - - Args: - item: CallHierarchyItem from get_call_hierarchy_items - - Returns: - List of CallHierarchyIncomingCall dicts - """ - # Determine language from item's uri - uri = item.get("uri", "") - file_path = uri.replace("file:///", "").replace("file://", "") - - state = await self._get_server(file_path) - if not state: - return [] - - result = await self._send_request( - state, - "callHierarchy/incomingCalls", - {"item": item}, - ) - - if not result or not isinstance(result, list): - return [] - - return result - - async def get_outgoing_calls( - self, - item: Dict[str, Any], - ) -> List[Dict[str, Any]]: - """Get outgoing calls for a call hierarchy item. - - Args: - item: CallHierarchyItem from get_call_hierarchy_items - - Returns: - List of CallHierarchyOutgoingCall dicts - """ - # Determine language from item's uri - uri = item.get("uri", "") - file_path = uri.replace("file:///", "").replace("file://", "") - - state = await self._get_server(file_path) - if not state: - return [] - - result = await self._send_request( - state, - "callHierarchy/outgoingCalls", - {"item": item}, - ) - - if not result or not isinstance(result, list): - return [] - - return result - - async def __aenter__(self) -> "StandaloneLspManager": - """Async context manager entry.""" - await self.start() - return self - - async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: - """Async context manager exit - stop all servers.""" - await self.stop() - - -# Simple test -if __name__ == "__main__": - async def test_standalone_manager(): - """Test StandaloneLspManager functionality.""" - print("Testing StandaloneLspManager...") - print() - - # Find a Python file to test with - test_file = Path(__file__).resolve() - print(f"Test file: {test_file}") - print() - - async with StandaloneLspManager( - 
workspace_root=str(test_file.parent.parent.parent.parent), # codex-lens root - timeout=30.0, - ) as manager: - print("1. Testing get_document_symbols...") - symbols = await manager.get_document_symbols(str(test_file)) - print(f" Found {len(symbols)} symbols") - for sym in symbols[:5]: - name = sym.get("name", "?") - kind = sym.get("kind", "?") - print(f" - {name} (kind={kind})") - print() - - print("2. Testing get_definition...") - # Test definition for 'asyncio' import (line 11) - definition = await manager.get_definition(str(test_file), 11, 8) - if definition: - print(f" Definition: {definition}") - else: - print(" No definition found") - print() - - print("3. Testing get_hover...") - hover = await manager.get_hover(str(test_file), 11, 8) - if hover: - print(f" Hover: {hover[:200]}...") - else: - print(" No hover info") - print() - - print("4. Testing get_references...") - refs = await manager.get_references(str(test_file), 50, 10) - print(f" Found {len(refs)} references") - for ref in refs[:3]: - print(f" - {ref}") - - print() - print("Test complete!") - - # Run the test - # Note: On Windows, use default ProactorEventLoop (supports subprocess creation) - - asyncio.run(test_standalone_manager()) diff --git a/codex-lens/build/lib/codexlens/mcp/__init__.py b/codex-lens/build/lib/codexlens/mcp/__init__.py deleted file mode 100644 index 5bb171c3..00000000 --- a/codex-lens/build/lib/codexlens/mcp/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -"""Model Context Protocol implementation for Claude Code integration.""" - -from codexlens.mcp.schema import ( - MCPContext, - SymbolInfo, - ReferenceInfo, - RelatedSymbol, -) -from codexlens.mcp.provider import MCPProvider -from codexlens.mcp.hooks import HookManager, create_context_for_prompt - -__all__ = [ - "MCPContext", - "SymbolInfo", - "ReferenceInfo", - "RelatedSymbol", - "MCPProvider", - "HookManager", - "create_context_for_prompt", -] diff --git a/codex-lens/build/lib/codexlens/mcp/hooks.py 
b/codex-lens/build/lib/codexlens/mcp/hooks.py deleted file mode 100644 index ad6a2021..00000000 --- a/codex-lens/build/lib/codexlens/mcp/hooks.py +++ /dev/null @@ -1,170 +0,0 @@ -"""Hook interfaces for Claude Code integration.""" - -from __future__ import annotations - -import logging -from pathlib import Path -from typing import Any, Dict, Optional, Callable, TYPE_CHECKING - -from codexlens.mcp.schema import MCPContext - -if TYPE_CHECKING: - from codexlens.mcp.provider import MCPProvider - -logger = logging.getLogger(__name__) - - -class HookManager: - """Manages hook registration and execution.""" - - def __init__(self, mcp_provider: "MCPProvider") -> None: - self.mcp_provider = mcp_provider - self._pre_hooks: Dict[str, Callable] = {} - self._post_hooks: Dict[str, Callable] = {} - - # Register default hooks - self._register_default_hooks() - - def _register_default_hooks(self) -> None: - """Register built-in hooks.""" - self._pre_hooks["explain"] = self._pre_explain_hook - self._pre_hooks["refactor"] = self._pre_refactor_hook - self._pre_hooks["document"] = self._pre_document_hook - - def execute_pre_hook( - self, - action: str, - params: Dict[str, Any], - ) -> Optional[MCPContext]: - """Execute pre-tool hook to gather context. - - Args: - action: The action being performed (e.g., "explain", "refactor") - params: Parameters for the action - - Returns: - MCPContext to inject into prompt, or None - """ - hook = self._pre_hooks.get(action) - - if not hook: - logger.debug(f"No pre-hook for action: {action}") - return None - - try: - return hook(params) - except Exception as e: - logger.error(f"Pre-hook failed for {action}: {e}") - return None - - def execute_post_hook( - self, - action: str, - result: Any, - ) -> None: - """Execute post-tool hook for proactive caching. 
- - Args: - action: The action that was performed - result: Result of the action - """ - hook = self._post_hooks.get(action) - - if not hook: - return - - try: - hook(result) - except Exception as e: - logger.error(f"Post-hook failed for {action}: {e}") - - def _pre_explain_hook(self, params: Dict[str, Any]) -> Optional[MCPContext]: - """Pre-hook for 'explain' action.""" - symbol_name = params.get("symbol") - - if not symbol_name: - return None - - return self.mcp_provider.build_context( - symbol_name=symbol_name, - context_type="symbol_explanation", - include_references=True, - include_related=True, - ) - - def _pre_refactor_hook(self, params: Dict[str, Any]) -> Optional[MCPContext]: - """Pre-hook for 'refactor' action.""" - symbol_name = params.get("symbol") - - if not symbol_name: - return None - - return self.mcp_provider.build_context( - symbol_name=symbol_name, - context_type="refactor_context", - include_references=True, - include_related=True, - max_references=20, - ) - - def _pre_document_hook(self, params: Dict[str, Any]) -> Optional[MCPContext]: - """Pre-hook for 'document' action.""" - symbol_name = params.get("symbol") - file_path = params.get("file_path") - - if symbol_name: - return self.mcp_provider.build_context( - symbol_name=symbol_name, - context_type="documentation_context", - include_references=False, - include_related=True, - ) - elif file_path: - return self.mcp_provider.build_context_for_file( - Path(file_path), - context_type="file_documentation", - ) - - return None - - def register_pre_hook( - self, - action: str, - hook: Callable[[Dict[str, Any]], Optional[MCPContext]], - ) -> None: - """Register a custom pre-tool hook.""" - self._pre_hooks[action] = hook - - def register_post_hook( - self, - action: str, - hook: Callable[[Any], None], - ) -> None: - """Register a custom post-tool hook.""" - self._post_hooks[action] = hook - - -def create_context_for_prompt( - mcp_provider: "MCPProvider", - action: str, - params: Dict[str, Any], -) -> 
str: - """Create context string for prompt injection. - - This is the main entry point for Claude Code hook integration. - - Args: - mcp_provider: The MCP provider instance - action: Action being performed - params: Action parameters - - Returns: - Formatted context string for prompt injection - """ - manager = HookManager(mcp_provider) - context = manager.execute_pre_hook(action, params) - - if context: - return context.to_prompt_injection() - - return "" diff --git a/codex-lens/build/lib/codexlens/mcp/provider.py b/codex-lens/build/lib/codexlens/mcp/provider.py deleted file mode 100644 index 97ebc055..00000000 --- a/codex-lens/build/lib/codexlens/mcp/provider.py +++ /dev/null @@ -1,202 +0,0 @@ -"""MCP context provider.""" - -from __future__ import annotations - -import logging -from pathlib import Path -from typing import Optional, List, TYPE_CHECKING - -from codexlens.mcp.schema import ( - MCPContext, - SymbolInfo, - ReferenceInfo, - RelatedSymbol, -) - -if TYPE_CHECKING: - from codexlens.storage.global_index import GlobalSymbolIndex - from codexlens.storage.registry import RegistryStore - from codexlens.search.chain_search import ChainSearchEngine - -logger = logging.getLogger(__name__) - - -class MCPProvider: - """Builds MCP context objects from codex-lens data.""" - - def __init__( - self, - global_index: "GlobalSymbolIndex", - search_engine: "ChainSearchEngine", - registry: "RegistryStore", - ) -> None: - self.global_index = global_index - self.search_engine = search_engine - self.registry = registry - - def build_context( - self, - symbol_name: str, - context_type: str = "symbol_explanation", - include_references: bool = True, - include_related: bool = True, - max_references: int = 10, - ) -> Optional[MCPContext]: - """Build comprehensive context for a symbol. 
- - Args: - symbol_name: Name of the symbol to contextualize - context_type: Type of context being requested - include_references: Whether to include reference locations - include_related: Whether to include related symbols - max_references: Maximum number of references to include - - Returns: - MCPContext object or None if symbol not found - """ - # Look up symbol - symbols = self.global_index.search(symbol_name, prefix_mode=False, limit=1) - - if not symbols: - logger.debug(f"Symbol not found for MCP context: {symbol_name}") - return None - - symbol = symbols[0] - - # Build SymbolInfo - symbol_info = SymbolInfo( - name=symbol.name, - kind=symbol.kind, - file_path=symbol.file or "", - line_start=symbol.range[0], - line_end=symbol.range[1], - signature=None, # Symbol entity doesn't have signature - documentation=None, # Symbol entity doesn't have docstring - ) - - # Extract definition source code - definition = self._extract_definition(symbol) - - # Get references - references = [] - if include_references: - refs = self.search_engine.search_references( - symbol_name, - limit=max_references, - ) - references = [ - ReferenceInfo( - file_path=r.file_path, - line=r.line, - column=r.column, - context=r.context, - relationship_type=r.relationship_type, - ) - for r in refs - ] - - # Get related symbols - related_symbols = [] - if include_related: - related_symbols = self._get_related_symbols(symbol) - - return MCPContext( - context_type=context_type, - symbol=symbol_info, - definition=definition, - references=references, - related_symbols=related_symbols, - metadata={ - "source": "codex-lens", - }, - ) - - def _extract_definition(self, symbol) -> Optional[str]: - """Extract source code for symbol definition.""" - try: - file_path = Path(symbol.file) if symbol.file else None - if not file_path or not file_path.exists(): - return None - - content = file_path.read_text(encoding='utf-8', errors='ignore') - lines = content.split("\n") - - start = symbol.range[0] - 1 - end = 
symbol.range[1] - - if start >= len(lines): - return None - - return "\n".join(lines[start:end]) - except Exception as e: - logger.debug(f"Failed to extract definition: {e}") - return None - - def _get_related_symbols(self, symbol) -> List[RelatedSymbol]: - """Get symbols related to the given symbol.""" - related = [] - - try: - # Search for symbols that might be related by name patterns - # This is a simplified implementation - could be enhanced with relationship data - - # Look for imports/callers via reference search - refs = self.search_engine.search_references(symbol.name, limit=20) - - seen_names = set() - for ref in refs: - # Extract potential symbol name from context - if ref.relationship_type and ref.relationship_type not in seen_names: - related.append(RelatedSymbol( - name=f"{Path(ref.file_path).stem}", - kind="module", - relationship=ref.relationship_type, - file_path=ref.file_path, - )) - seen_names.add(ref.relationship_type) - if len(related) >= 10: - break - - except Exception as e: - logger.debug(f"Failed to get related symbols: {e}") - - return related - - def build_context_for_file( - self, - file_path: Path, - context_type: str = "file_overview", - ) -> MCPContext: - """Build context for an entire file.""" - # Try to get symbols by searching with file path - # Note: GlobalSymbolIndex doesn't have search_by_file, so we use a different approach - symbols = [] - - # Search for common symbols that might be in this file - # This is a simplified approach - a full implementation would query by file path - try: - # Use the global index to search for symbols from this file - file_str = str(file_path.resolve()) - # Get all symbols and filter by file path (not efficient but works) - all_symbols = self.global_index.search("", prefix_mode=True, limit=1000) - symbols = [s for s in all_symbols if s.file and str(Path(s.file).resolve()) == file_str] - except Exception as e: - logger.debug(f"Failed to get file symbols: {e}") - - related = [ - RelatedSymbol( - 
name=s.name, - kind=s.kind, - relationship="defines", - ) - for s in symbols - ] - - return MCPContext( - context_type=context_type, - related_symbols=related, - metadata={ - "file_path": str(file_path), - "symbol_count": len(symbols), - }, - ) diff --git a/codex-lens/build/lib/codexlens/mcp/schema.py b/codex-lens/build/lib/codexlens/mcp/schema.py deleted file mode 100644 index 1062e626..00000000 --- a/codex-lens/build/lib/codexlens/mcp/schema.py +++ /dev/null @@ -1,113 +0,0 @@ -"""MCP data models.""" - -from __future__ import annotations - -import json -from dataclasses import dataclass, field, asdict -from typing import List, Optional - - -@dataclass -class SymbolInfo: - """Information about a code symbol.""" - name: str - kind: str - file_path: str - line_start: int - line_end: int - signature: Optional[str] = None - documentation: Optional[str] = None - - def to_dict(self) -> dict: - return {k: v for k, v in asdict(self).items() if v is not None} - - -@dataclass -class ReferenceInfo: - """Information about a symbol reference.""" - file_path: str - line: int - column: int - context: str - relationship_type: str - - def to_dict(self) -> dict: - return asdict(self) - - -@dataclass -class RelatedSymbol: - """Related symbol (import, call target, etc.).""" - name: str - kind: str - relationship: str # "imports", "calls", "inherits", "uses" - file_path: Optional[str] = None - - def to_dict(self) -> dict: - return {k: v for k, v in asdict(self).items() if v is not None} - - -@dataclass -class MCPContext: - """Model Context Protocol context object. - - This is the structured context that gets injected into - LLM prompts to provide code understanding. 
- """ - version: str = "1.0" - context_type: str = "code_context" - symbol: Optional[SymbolInfo] = None - definition: Optional[str] = None - references: List[ReferenceInfo] = field(default_factory=list) - related_symbols: List[RelatedSymbol] = field(default_factory=list) - metadata: dict = field(default_factory=dict) - - def to_dict(self) -> dict: - """Convert to dictionary for JSON serialization.""" - result = { - "version": self.version, - "context_type": self.context_type, - "metadata": self.metadata, - } - - if self.symbol: - result["symbol"] = self.symbol.to_dict() - if self.definition: - result["definition"] = self.definition - if self.references: - result["references"] = [r.to_dict() for r in self.references] - if self.related_symbols: - result["related_symbols"] = [s.to_dict() for s in self.related_symbols] - - return result - - def to_json(self, indent: int = 2) -> str: - """Serialize to JSON string.""" - return json.dumps(self.to_dict(), indent=indent) - - def to_prompt_injection(self) -> str: - """Format for injection into LLM prompt.""" - parts = [""] - - if self.symbol: - parts.append(f"## Symbol: {self.symbol.name}") - parts.append(f"Type: {self.symbol.kind}") - parts.append(f"Location: {self.symbol.file_path}:{self.symbol.line_start}") - - if self.definition: - parts.append("\n## Definition") - parts.append(f"```\n{self.definition}\n```") - - if self.references: - parts.append(f"\n## References ({len(self.references)} found)") - for ref in self.references[:5]: # Limit to 5 - parts.append(f"- {ref.file_path}:{ref.line} ({ref.relationship_type})") - parts.append(f" ```\n {ref.context}\n ```") - - if self.related_symbols: - parts.append("\n## Related Symbols") - for sym in self.related_symbols[:10]: # Limit to 10 - parts.append(f"- {sym.name} ({sym.relationship})") - - parts.append("") - return "\n".join(parts) diff --git a/codex-lens/build/lib/codexlens/parsers/__init__.py b/codex-lens/build/lib/codexlens/parsers/__init__.py deleted file mode 100644 
index f2ecfe78..00000000 --- a/codex-lens/build/lib/codexlens/parsers/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -"""Parsers for CodexLens.""" - -from __future__ import annotations - -from .factory import ParserFactory - -__all__ = ["ParserFactory"] - diff --git a/codex-lens/build/lib/codexlens/parsers/encoding.py b/codex-lens/build/lib/codexlens/parsers/encoding.py deleted file mode 100644 index b796d24b..00000000 --- a/codex-lens/build/lib/codexlens/parsers/encoding.py +++ /dev/null @@ -1,202 +0,0 @@ -"""Optional encoding detection module for CodexLens. - -Provides automatic encoding detection with graceful fallback to UTF-8. -Install with: pip install codexlens[encoding] -""" - -from __future__ import annotations - -import logging -from pathlib import Path -from typing import Tuple, Optional - -log = logging.getLogger(__name__) - -# Feature flag for encoding detection availability -ENCODING_DETECTION_AVAILABLE = False -_import_error: Optional[str] = None - - -def _detect_chardet_backend() -> Tuple[bool, Optional[str]]: - """Detect if chardet or charset-normalizer is available.""" - try: - import chardet - return True, None - except ImportError: - pass - - try: - from charset_normalizer import from_bytes - return True, None - except ImportError: - pass - - return False, "chardet not available. Install with: pip install codexlens[encoding]" - - -# Initialize on module load -ENCODING_DETECTION_AVAILABLE, _import_error = _detect_chardet_backend() - - -def check_encoding_available() -> Tuple[bool, Optional[str]]: - """Check if encoding detection dependencies are available. - - Returns: - Tuple of (available, error_message) - """ - return ENCODING_DETECTION_AVAILABLE, _import_error - - -def detect_encoding(content_bytes: bytes, confidence_threshold: float = 0.7) -> str: - """Detect encoding from file content bytes. - - Uses chardet or charset-normalizer with configurable confidence threshold. - Falls back to UTF-8 if confidence is too low or detection unavailable. 
- - Args: - content_bytes: Raw file content as bytes - confidence_threshold: Minimum confidence (0.0-1.0) to accept detection - - Returns: - Detected encoding name (e.g., 'utf-8', 'iso-8859-1', 'gbk') - Returns 'utf-8' as fallback if detection fails or confidence too low - """ - if not ENCODING_DETECTION_AVAILABLE: - log.debug("Encoding detection not available, using UTF-8 fallback") - return "utf-8" - - if not content_bytes: - return "utf-8" - - try: - # Try chardet first - try: - import chardet - result = chardet.detect(content_bytes) - encoding = result.get("encoding") - confidence = result.get("confidence", 0.0) - - if encoding and confidence >= confidence_threshold: - log.debug(f"Detected encoding: {encoding} (confidence: {confidence:.2f})") - # Normalize encoding name: replace underscores with hyphens - return encoding.lower().replace('_', '-') - else: - log.debug( - f"Low confidence encoding detection: {encoding} " - f"(confidence: {confidence:.2f}), using UTF-8 fallback" - ) - return "utf-8" - except ImportError: - pass - - # Fallback to charset-normalizer - try: - from charset_normalizer import from_bytes - results = from_bytes(content_bytes) - if results: - best = results.best() - if best and best.encoding: - log.debug(f"Detected encoding via charset-normalizer: {best.encoding}") - # Normalize encoding name: replace underscores with hyphens - return best.encoding.lower().replace('_', '-') - except ImportError: - pass - - except Exception as e: - log.warning(f"Encoding detection failed: {e}, using UTF-8 fallback") - - return "utf-8" - - -def read_file_safe( - path: Path | str, - confidence_threshold: float = 0.7, - max_detection_bytes: int = 100_000 -) -> Tuple[str, str]: - """Read file with automatic encoding detection and safe decoding. - - Reads file bytes, detects encoding, and decodes with error replacement - to preserve file structure even with encoding issues. 
- - Args: - path: Path to file to read - confidence_threshold: Minimum confidence for encoding detection - max_detection_bytes: Maximum bytes to use for encoding detection (default 100KB) - - Returns: - Tuple of (content, detected_encoding) - - content: Decoded file content (with � for unmappable bytes) - - detected_encoding: Detected encoding name - - Raises: - OSError: If file cannot be read - IsADirectoryError: If path is a directory - """ - file_path = Path(path) if isinstance(path, str) else path - - # Read file bytes - try: - content_bytes = file_path.read_bytes() - except Exception as e: - log.error(f"Failed to read file {file_path}: {e}") - raise - - # Detect encoding from first N bytes for performance - detection_sample = content_bytes[:max_detection_bytes] if len(content_bytes) > max_detection_bytes else content_bytes - encoding = detect_encoding(detection_sample, confidence_threshold) - - # Decode with error replacement to preserve structure - try: - content = content_bytes.decode(encoding, errors='replace') - log.debug(f"Successfully decoded {file_path} using {encoding}") - return content, encoding - except Exception as e: - # Final fallback to UTF-8 with replacement - log.warning(f"Failed to decode {file_path} with {encoding}, using UTF-8: {e}") - content = content_bytes.decode('utf-8', errors='replace') - return content, 'utf-8' - - -def is_binary_file(path: Path | str, sample_size: int = 8192) -> bool: - """Check if file is likely binary by sampling first bytes. - - Uses heuristic: if >30% of sample bytes are null or non-text, consider binary. 
- - Args: - path: Path to file to check - sample_size: Number of bytes to sample (default 8KB) - - Returns: - True if file appears to be binary, False otherwise - """ - file_path = Path(path) if isinstance(path, str) else path - - try: - with file_path.open('rb') as f: - sample = f.read(sample_size) - - if not sample: - return False - - # Count null bytes and non-printable characters - null_count = sample.count(b'\x00') - non_text_count = sum(1 for byte in sample if byte < 0x20 and byte not in (0x09, 0x0a, 0x0d)) - - # If >30% null bytes or >50% non-text, consider binary - null_ratio = null_count / len(sample) - non_text_ratio = non_text_count / len(sample) - - return null_ratio > 0.3 or non_text_ratio > 0.5 - - except Exception as e: - log.debug(f"Binary check failed for {file_path}: {e}, assuming text") - return False - - -__all__ = [ - "ENCODING_DETECTION_AVAILABLE", - "check_encoding_available", - "detect_encoding", - "read_file_safe", - "is_binary_file", -] diff --git a/codex-lens/build/lib/codexlens/parsers/factory.py b/codex-lens/build/lib/codexlens/parsers/factory.py deleted file mode 100644 index 0f8f4f14..00000000 --- a/codex-lens/build/lib/codexlens/parsers/factory.py +++ /dev/null @@ -1,385 +0,0 @@ -"""Parser factory for CodexLens. - -Python and JavaScript/TypeScript parsing use Tree-Sitter grammars when -available. Regex fallbacks are retained to preserve the existing parser -interface and behavior in minimal environments. -""" - -from __future__ import annotations - -import re -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List, Optional, Protocol - -from codexlens.config import Config -from codexlens.entities import CodeRelationship, IndexedFile, RelationshipType, Symbol -from codexlens.parsers.treesitter_parser import TreeSitterSymbolParser - - -class Parser(Protocol): - def parse(self, text: str, path: Path) -> IndexedFile: ... 
- - -@dataclass -class SimpleRegexParser: - language_id: str - - def parse(self, text: str, path: Path) -> IndexedFile: - # Try tree-sitter first for supported languages - if self.language_id in {"python", "javascript", "typescript"}: - ts_parser = TreeSitterSymbolParser(self.language_id, path) - if ts_parser.is_available(): - indexed = ts_parser.parse(text, path) - if indexed is not None: - return indexed - - # Fallback to regex parsing - if self.language_id == "python": - symbols = _parse_python_symbols_regex(text) - relationships = _parse_python_relationships_regex(text, path) - elif self.language_id in {"javascript", "typescript"}: - symbols = _parse_js_ts_symbols_regex(text) - relationships = _parse_js_ts_relationships_regex(text, path) - elif self.language_id == "java": - symbols = _parse_java_symbols(text) - relationships = [] - elif self.language_id == "go": - symbols = _parse_go_symbols(text) - relationships = [] - elif self.language_id == "markdown": - symbols = _parse_markdown_symbols(text) - relationships = [] - elif self.language_id == "text": - symbols = _parse_text_symbols(text) - relationships = [] - else: - symbols = _parse_generic_symbols(text) - relationships = [] - - return IndexedFile( - path=str(path.resolve()), - language=self.language_id, - symbols=symbols, - chunks=[], - relationships=relationships, - ) - - -class ParserFactory: - def __init__(self, config: Config) -> None: - self.config = config - self._parsers: Dict[str, Parser] = {} - - def get_parser(self, language_id: str) -> Parser: - if language_id not in self._parsers: - self._parsers[language_id] = SimpleRegexParser(language_id) - return self._parsers[language_id] - - -# Regex-based fallback parsers -_PY_CLASS_RE = re.compile(r"^\s*class\s+([A-Za-z_]\w*)\b") -_PY_DEF_RE = re.compile(r"^\s*(?:async\s+)?def\s+([A-Za-z_]\w*)\s*\(") - -_PY_IMPORT_RE = re.compile(r"^(?:from\s+([\w.]+)\s+)?import\s+([\w.,\s]+)") -_PY_CALL_RE = re.compile(r"(? 
List[Symbol]: - """Parse Python symbols, using tree-sitter if available, regex fallback.""" - ts_parser = TreeSitterSymbolParser("python") - if ts_parser.is_available(): - symbols = ts_parser.parse_symbols(text) - if symbols is not None: - return symbols - return _parse_python_symbols_regex(text) - - -def _parse_js_ts_symbols( - text: str, - language_id: str = "javascript", - path: Optional[Path] = None, -) -> List[Symbol]: - """Parse JS/TS symbols, using tree-sitter if available, regex fallback.""" - ts_parser = TreeSitterSymbolParser(language_id, path) - if ts_parser.is_available(): - symbols = ts_parser.parse_symbols(text) - if symbols is not None: - return symbols - return _parse_js_ts_symbols_regex(text) - - -def _parse_python_symbols_regex(text: str) -> List[Symbol]: - symbols: List[Symbol] = [] - current_class_indent: Optional[int] = None - for i, line in enumerate(text.splitlines(), start=1): - class_match = _PY_CLASS_RE.match(line) - if class_match: - current_class_indent = len(line) - len(line.lstrip(" ")) - symbols.append(Symbol(name=class_match.group(1), kind="class", range=(i, i))) - continue - def_match = _PY_DEF_RE.match(line) - if def_match: - indent = len(line) - len(line.lstrip(" ")) - kind = "method" if current_class_indent is not None and indent > current_class_indent else "function" - symbols.append(Symbol(name=def_match.group(1), kind=kind, range=(i, i))) - continue - if current_class_indent is not None: - indent = len(line) - len(line.lstrip(" ")) - if line.strip() and indent <= current_class_indent: - current_class_indent = None - return symbols - - -def _parse_python_relationships_regex(text: str, path: Path) -> List[CodeRelationship]: - relationships: List[CodeRelationship] = [] - current_scope: str | None = None - source_file = str(path.resolve()) - - for line_num, line in enumerate(text.splitlines(), start=1): - class_match = _PY_CLASS_RE.match(line) - if class_match: - current_scope = class_match.group(1) - continue - - def_match = 
_PY_DEF_RE.match(line) - if def_match: - current_scope = def_match.group(1) - continue - - if current_scope is None: - continue - - import_match = _PY_IMPORT_RE.search(line) - if import_match: - import_target = import_match.group(1) or import_match.group(2) - if import_target: - relationships.append( - CodeRelationship( - source_symbol=current_scope, - target_symbol=import_target.strip(), - relationship_type=RelationshipType.IMPORTS, - source_file=source_file, - target_file=None, - source_line=line_num, - ) - ) - - for call_match in _PY_CALL_RE.finditer(line): - call_name = call_match.group(1) - if call_name in { - "if", - "for", - "while", - "return", - "print", - "len", - "str", - "int", - "float", - "list", - "dict", - "set", - "tuple", - current_scope, - }: - continue - relationships.append( - CodeRelationship( - source_symbol=current_scope, - target_symbol=call_name, - relationship_type=RelationshipType.CALL, - source_file=source_file, - target_file=None, - source_line=line_num, - ) - ) - - return relationships - - -_JS_FUNC_RE = re.compile(r"^\s*(?:export\s+)?(?:async\s+)?function\s+([A-Za-z_$][\w$]*)\s*\(") -_JS_CLASS_RE = re.compile(r"^\s*(?:export\s+)?class\s+([A-Za-z_$][\w$]*)\b") -_JS_ARROW_RE = re.compile( - r"^\s*(?:export\s+)?(?:const|let|var)\s+([A-Za-z_$][\w$]*)\s*=\s*(?:async\s*)?\(?[^)]*\)?\s*=>" -) -_JS_METHOD_RE = re.compile(r"^\s+(?:async\s+)?([A-Za-z_$][\w$]*)\s*\([^)]*\)\s*\{") -_JS_IMPORT_RE = re.compile(r"import\s+.*\s+from\s+['\"]([^'\"]+)['\"]") -_JS_CALL_RE = re.compile(r"(? 
List[Symbol]: - symbols: List[Symbol] = [] - in_class = False - class_brace_depth = 0 - brace_depth = 0 - - for i, line in enumerate(text.splitlines(), start=1): - brace_depth += line.count("{") - line.count("}") - - class_match = _JS_CLASS_RE.match(line) - if class_match: - symbols.append(Symbol(name=class_match.group(1), kind="class", range=(i, i))) - in_class = True - class_brace_depth = brace_depth - continue - - if in_class and brace_depth < class_brace_depth: - in_class = False - - func_match = _JS_FUNC_RE.match(line) - if func_match: - symbols.append(Symbol(name=func_match.group(1), kind="function", range=(i, i))) - continue - - arrow_match = _JS_ARROW_RE.match(line) - if arrow_match: - symbols.append(Symbol(name=arrow_match.group(1), kind="function", range=(i, i))) - continue - - if in_class: - method_match = _JS_METHOD_RE.match(line) - if method_match: - name = method_match.group(1) - if name != "constructor": - symbols.append(Symbol(name=name, kind="method", range=(i, i))) - - return symbols - - -def _parse_js_ts_relationships_regex(text: str, path: Path) -> List[CodeRelationship]: - relationships: List[CodeRelationship] = [] - current_scope: str | None = None - source_file = str(path.resolve()) - - for line_num, line in enumerate(text.splitlines(), start=1): - class_match = _JS_CLASS_RE.match(line) - if class_match: - current_scope = class_match.group(1) - continue - - func_match = _JS_FUNC_RE.match(line) - if func_match: - current_scope = func_match.group(1) - continue - - arrow_match = _JS_ARROW_RE.match(line) - if arrow_match: - current_scope = arrow_match.group(1) - continue - - if current_scope is None: - continue - - import_match = _JS_IMPORT_RE.search(line) - if import_match: - relationships.append( - CodeRelationship( - source_symbol=current_scope, - target_symbol=import_match.group(1), - relationship_type=RelationshipType.IMPORTS, - source_file=source_file, - target_file=None, - source_line=line_num, - ) - ) - - for call_match in 
_JS_CALL_RE.finditer(line): - call_name = call_match.group(1) - if call_name in {current_scope}: - continue - relationships.append( - CodeRelationship( - source_symbol=current_scope, - target_symbol=call_name, - relationship_type=RelationshipType.CALL, - source_file=source_file, - target_file=None, - source_line=line_num, - ) - ) - - return relationships - - -_JAVA_CLASS_RE = re.compile(r"^\s*(?:public\s+)?class\s+([A-Za-z_]\w*)\b") -_JAVA_METHOD_RE = re.compile( - r"^\s*(?:public|private|protected|static|\s)+[\w<>\[\]]+\s+([A-Za-z_]\w*)\s*\(" -) - - -def _parse_java_symbols(text: str) -> List[Symbol]: - symbols: List[Symbol] = [] - for i, line in enumerate(text.splitlines(), start=1): - class_match = _JAVA_CLASS_RE.match(line) - if class_match: - symbols.append(Symbol(name=class_match.group(1), kind="class", range=(i, i))) - continue - method_match = _JAVA_METHOD_RE.match(line) - if method_match: - symbols.append(Symbol(name=method_match.group(1), kind="method", range=(i, i))) - return symbols - - -_GO_FUNC_RE = re.compile(r"^\s*func\s+(?:\([^)]+\)\s+)?([A-Za-z_]\w*)\s*\(") -_GO_TYPE_RE = re.compile(r"^\s*type\s+([A-Za-z_]\w*)\s+(?:struct|interface)\b") - - -def _parse_go_symbols(text: str) -> List[Symbol]: - symbols: List[Symbol] = [] - for i, line in enumerate(text.splitlines(), start=1): - type_match = _GO_TYPE_RE.match(line) - if type_match: - symbols.append(Symbol(name=type_match.group(1), kind="class", range=(i, i))) - continue - func_match = _GO_FUNC_RE.match(line) - if func_match: - symbols.append(Symbol(name=func_match.group(1), kind="function", range=(i, i))) - return symbols - - -_GENERIC_DEF_RE = re.compile(r"^\s*(?:def|function|func)\s+([A-Za-z_]\w*)\b") -_GENERIC_CLASS_RE = re.compile(r"^\s*(?:class|struct|interface)\s+([A-Za-z_]\w*)\b") - - -def _parse_generic_symbols(text: str) -> List[Symbol]: - symbols: List[Symbol] = [] - for i, line in enumerate(text.splitlines(), start=1): - class_match = _GENERIC_CLASS_RE.match(line) - if class_match: - 
symbols.append(Symbol(name=class_match.group(1), kind="class", range=(i, i))) - continue - def_match = _GENERIC_DEF_RE.match(line) - if def_match: - symbols.append(Symbol(name=def_match.group(1), kind="function", range=(i, i))) - return symbols - - -# Markdown heading regex: # Heading, ## Heading, etc. -_MD_HEADING_RE = re.compile(r"^(#{1,6})\s+(.+)$") - - -def _parse_markdown_symbols(text: str) -> List[Symbol]: - """Parse Markdown headings as symbols. - - Extracts # headings as 'section' symbols with heading level as kind suffix. - """ - symbols: List[Symbol] = [] - for i, line in enumerate(text.splitlines(), start=1): - heading_match = _MD_HEADING_RE.match(line) - if heading_match: - level = len(heading_match.group(1)) - title = heading_match.group(2).strip() - # Use 'section' kind with level indicator - kind = f"h{level}" - symbols.append(Symbol(name=title, kind=kind, range=(i, i))) - return symbols - - -def _parse_text_symbols(text: str) -> List[Symbol]: - """Parse plain text files - no symbols, just index content.""" - # Text files don't have structured symbols, return empty list - # The file content will still be indexed for FTS search - return [] diff --git a/codex-lens/build/lib/codexlens/parsers/tokenizer.py b/codex-lens/build/lib/codexlens/parsers/tokenizer.py deleted file mode 100644 index dcb12238..00000000 --- a/codex-lens/build/lib/codexlens/parsers/tokenizer.py +++ /dev/null @@ -1,98 +0,0 @@ -"""Token counting utilities for CodexLens. - -Provides accurate token counting using tiktoken with character count fallback. -""" - -from __future__ import annotations - -from typing import Optional - -try: - import tiktoken - TIKTOKEN_AVAILABLE = True -except ImportError: - TIKTOKEN_AVAILABLE = False - - -class Tokenizer: - """Token counter with tiktoken primary and character count fallback.""" - - def __init__(self, encoding_name: str = "cl100k_base") -> None: - """Initialize tokenizer. 
- - Args: - encoding_name: Tiktoken encoding name (default: cl100k_base for GPT-4) - """ - self._encoding: Optional[object] = None - self._encoding_name = encoding_name - - if TIKTOKEN_AVAILABLE: - try: - self._encoding = tiktoken.get_encoding(encoding_name) - except Exception: - # Fallback to character counting if encoding fails - self._encoding = None - - def count_tokens(self, text: str) -> int: - """Count tokens in text. - - Uses tiktoken if available, otherwise falls back to character count / 4. - - Args: - text: Text to count tokens for - - Returns: - Estimated token count - """ - if not text: - return 0 - - if self._encoding is not None: - try: - return len(self._encoding.encode(text)) # type: ignore[attr-defined] - except Exception: - # Fall through to character count fallback - pass - - # Fallback: rough estimate using character count - # Average of ~4 characters per token for English text - return max(1, len(text) // 4) - - def is_using_tiktoken(self) -> bool: - """Check if tiktoken is being used. - - Returns: - True if tiktoken is available and initialized - """ - return self._encoding is not None - - -# Global default tokenizer instance -_default_tokenizer: Optional[Tokenizer] = None - - -def get_default_tokenizer() -> Tokenizer: - """Get the global default tokenizer instance. - - Returns: - Shared Tokenizer instance - """ - global _default_tokenizer - if _default_tokenizer is None: - _default_tokenizer = Tokenizer() - return _default_tokenizer - - -def count_tokens(text: str, tokenizer: Optional[Tokenizer] = None) -> int: - """Count tokens in text using default or provided tokenizer. 
- - Args: - text: Text to count tokens for - tokenizer: Optional tokenizer instance (uses default if None) - - Returns: - Estimated token count - """ - if tokenizer is None: - tokenizer = get_default_tokenizer() - return tokenizer.count_tokens(text) diff --git a/codex-lens/build/lib/codexlens/parsers/treesitter_parser.py b/codex-lens/build/lib/codexlens/parsers/treesitter_parser.py deleted file mode 100644 index 4ae44cae..00000000 --- a/codex-lens/build/lib/codexlens/parsers/treesitter_parser.py +++ /dev/null @@ -1,809 +0,0 @@ -"""Tree-sitter based parser for CodexLens. - -Provides precise AST-level parsing via tree-sitter. - -Note: This module does not provide a regex fallback inside `TreeSitterSymbolParser`. -If tree-sitter (or a language binding) is unavailable, `parse()`/`parse_symbols()` -return `None`; callers should use a regex-based fallback such as -`codexlens.parsers.factory.SimpleRegexParser`. -""" - -from __future__ import annotations - -from pathlib import Path -from typing import Dict, List, Optional - -try: - from tree_sitter import Language as TreeSitterLanguage - from tree_sitter import Node as TreeSitterNode - from tree_sitter import Parser as TreeSitterParser - TREE_SITTER_AVAILABLE = True -except ImportError: - TreeSitterLanguage = None # type: ignore[assignment] - TreeSitterNode = None # type: ignore[assignment] - TreeSitterParser = None # type: ignore[assignment] - TREE_SITTER_AVAILABLE = False - -from codexlens.entities import CodeRelationship, IndexedFile, RelationshipType, Symbol -from codexlens.parsers.tokenizer import get_default_tokenizer - - -class TreeSitterSymbolParser: - """Parser using tree-sitter for AST-level symbol extraction.""" - - def __init__(self, language_id: str, path: Optional[Path] = None) -> None: - """Initialize tree-sitter parser for a language. - - Args: - language_id: Language identifier (python, javascript, typescript, etc.) 
- path: Optional file path for language variant detection (e.g., .tsx) - """ - self.language_id = language_id - self.path = path - self._parser: Optional[object] = None - self._language: Optional[TreeSitterLanguage] = None - self._tokenizer = get_default_tokenizer() - - if TREE_SITTER_AVAILABLE: - self._initialize_parser() - - def _initialize_parser(self) -> None: - """Initialize tree-sitter parser and language.""" - if TreeSitterParser is None or TreeSitterLanguage is None: - return - - try: - # Load language grammar - if self.language_id == "python": - import tree_sitter_python - self._language = TreeSitterLanguage(tree_sitter_python.language()) - elif self.language_id == "javascript": - import tree_sitter_javascript - self._language = TreeSitterLanguage(tree_sitter_javascript.language()) - elif self.language_id == "typescript": - import tree_sitter_typescript - # Detect TSX files by extension - if self.path is not None and self.path.suffix.lower() == ".tsx": - self._language = TreeSitterLanguage(tree_sitter_typescript.language_tsx()) - else: - self._language = TreeSitterLanguage(tree_sitter_typescript.language_typescript()) - else: - return - - # Create parser - self._parser = TreeSitterParser() - if hasattr(self._parser, "set_language"): - self._parser.set_language(self._language) # type: ignore[attr-defined] - else: - self._parser.language = self._language # type: ignore[assignment] - - except Exception: - # Gracefully handle missing language bindings - self._parser = None - self._language = None - - def is_available(self) -> bool: - """Check if tree-sitter parser is available. 
- - Returns: - True if parser is initialized and ready - """ - return self._parser is not None and self._language is not None - - def _parse_tree(self, text: str) -> Optional[tuple[bytes, TreeSitterNode]]: - if not self.is_available() or self._parser is None: - return None - - try: - source_bytes = text.encode("utf8") - tree = self._parser.parse(source_bytes) # type: ignore[attr-defined] - return source_bytes, tree.root_node - except Exception: - return None - - def parse_symbols(self, text: str) -> Optional[List[Symbol]]: - """Parse source code and extract symbols without creating IndexedFile. - - Args: - text: Source code text - - Returns: - List of symbols if parsing succeeds, None if tree-sitter unavailable - """ - parsed = self._parse_tree(text) - if parsed is None: - return None - - source_bytes, root = parsed - try: - return self._extract_symbols(source_bytes, root) - except Exception: - # Gracefully handle extraction errors - return None - - def parse(self, text: str, path: Path) -> Optional[IndexedFile]: - """Parse source code and extract symbols. - - Args: - text: Source code text - path: File path - - Returns: - IndexedFile if parsing succeeds, None if tree-sitter unavailable - """ - parsed = self._parse_tree(text) - if parsed is None: - return None - - source_bytes, root = parsed - try: - symbols = self._extract_symbols(source_bytes, root) - relationships = self._extract_relationships(source_bytes, root, path) - - return IndexedFile( - path=str(path.resolve()), - language=self.language_id, - symbols=symbols, - chunks=[], - relationships=relationships, - ) - except Exception: - # Gracefully handle parsing errors - return None - - def _extract_symbols(self, source_bytes: bytes, root: TreeSitterNode) -> List[Symbol]: - """Extract symbols from AST. 
- - Args: - source_bytes: Source code as bytes - root: Root AST node - - Returns: - List of extracted symbols - """ - if self.language_id == "python": - return self._extract_python_symbols(source_bytes, root) - elif self.language_id in {"javascript", "typescript"}: - return self._extract_js_ts_symbols(source_bytes, root) - else: - return [] - - def _extract_relationships( - self, - source_bytes: bytes, - root: TreeSitterNode, - path: Path, - ) -> List[CodeRelationship]: - if self.language_id == "python": - return self._extract_python_relationships(source_bytes, root, path) - if self.language_id in {"javascript", "typescript"}: - return self._extract_js_ts_relationships(source_bytes, root, path) - return [] - - def _extract_python_relationships( - self, - source_bytes: bytes, - root: TreeSitterNode, - path: Path, - ) -> List[CodeRelationship]: - source_file = str(path.resolve()) - relationships: List[CodeRelationship] = [] - - scope_stack: List[str] = [] - alias_stack: List[Dict[str, str]] = [{}] - - def record_import(target_symbol: str, source_line: int) -> None: - if not target_symbol.strip() or not scope_stack: - return - relationships.append( - CodeRelationship( - source_symbol=scope_stack[-1], - target_symbol=target_symbol, - relationship_type=RelationshipType.IMPORTS, - source_file=source_file, - target_file=None, - source_line=source_line, - ) - ) - - def record_call(target_symbol: str, source_line: int) -> None: - if not target_symbol.strip() or not scope_stack: - return - base = target_symbol.split(".", 1)[0] - if base in {"self", "cls"}: - return - relationships.append( - CodeRelationship( - source_symbol=scope_stack[-1], - target_symbol=target_symbol, - relationship_type=RelationshipType.CALL, - source_file=source_file, - target_file=None, - source_line=source_line, - ) - ) - - def record_inherits(target_symbol: str, source_line: int) -> None: - if not target_symbol.strip() or not scope_stack: - return - relationships.append( - CodeRelationship( - 
source_symbol=scope_stack[-1], - target_symbol=target_symbol, - relationship_type=RelationshipType.INHERITS, - source_file=source_file, - target_file=None, - source_line=source_line, - ) - ) - - def visit(node: TreeSitterNode) -> None: - pushed_scope = False - pushed_aliases = False - - if node.type in {"class_definition", "function_definition", "async_function_definition"}: - name_node = node.child_by_field_name("name") - if name_node is not None: - scope_name = self._node_text(source_bytes, name_node).strip() - if scope_name: - scope_stack.append(scope_name) - pushed_scope = True - alias_stack.append(dict(alias_stack[-1])) - pushed_aliases = True - - if node.type == "class_definition" and pushed_scope: - superclasses = node.child_by_field_name("superclasses") - if superclasses is not None: - for child in superclasses.children: - dotted = self._python_expression_to_dotted(source_bytes, child) - if not dotted: - continue - resolved = self._resolve_alias_dotted(dotted, alias_stack[-1]) - record_inherits(resolved, self._node_start_line(node)) - - if node.type in {"import_statement", "import_from_statement"}: - updates, imported_targets = self._python_import_aliases_and_targets(source_bytes, node) - if updates: - alias_stack[-1].update(updates) - for target_symbol in imported_targets: - record_import(target_symbol, self._node_start_line(node)) - - if node.type == "call": - fn_node = node.child_by_field_name("function") - if fn_node is not None: - dotted = self._python_expression_to_dotted(source_bytes, fn_node) - if dotted: - resolved = self._resolve_alias_dotted(dotted, alias_stack[-1]) - record_call(resolved, self._node_start_line(node)) - - for child in node.children: - visit(child) - - if pushed_aliases: - alias_stack.pop() - if pushed_scope: - scope_stack.pop() - - visit(root) - return relationships - - def _extract_js_ts_relationships( - self, - source_bytes: bytes, - root: TreeSitterNode, - path: Path, - ) -> List[CodeRelationship]: - source_file = 
str(path.resolve()) - relationships: List[CodeRelationship] = [] - - scope_stack: List[str] = [] - alias_stack: List[Dict[str, str]] = [{}] - - def record_import(target_symbol: str, source_line: int) -> None: - if not target_symbol.strip() or not scope_stack: - return - relationships.append( - CodeRelationship( - source_symbol=scope_stack[-1], - target_symbol=target_symbol, - relationship_type=RelationshipType.IMPORTS, - source_file=source_file, - target_file=None, - source_line=source_line, - ) - ) - - def record_call(target_symbol: str, source_line: int) -> None: - if not target_symbol.strip() or not scope_stack: - return - base = target_symbol.split(".", 1)[0] - if base in {"this", "super"}: - return - relationships.append( - CodeRelationship( - source_symbol=scope_stack[-1], - target_symbol=target_symbol, - relationship_type=RelationshipType.CALL, - source_file=source_file, - target_file=None, - source_line=source_line, - ) - ) - - def record_inherits(target_symbol: str, source_line: int) -> None: - if not target_symbol.strip() or not scope_stack: - return - relationships.append( - CodeRelationship( - source_symbol=scope_stack[-1], - target_symbol=target_symbol, - relationship_type=RelationshipType.INHERITS, - source_file=source_file, - target_file=None, - source_line=source_line, - ) - ) - - def visit(node: TreeSitterNode) -> None: - pushed_scope = False - pushed_aliases = False - - if node.type in {"function_declaration", "generator_function_declaration"}: - name_node = node.child_by_field_name("name") - if name_node is not None: - scope_name = self._node_text(source_bytes, name_node).strip() - if scope_name: - scope_stack.append(scope_name) - pushed_scope = True - alias_stack.append(dict(alias_stack[-1])) - pushed_aliases = True - - if node.type in {"class_declaration", "class"}: - name_node = node.child_by_field_name("name") - if name_node is not None: - scope_name = self._node_text(source_bytes, name_node).strip() - if scope_name: - 
scope_stack.append(scope_name) - pushed_scope = True - alias_stack.append(dict(alias_stack[-1])) - pushed_aliases = True - - if pushed_scope: - superclass = node.child_by_field_name("superclass") - if superclass is not None: - dotted = self._js_expression_to_dotted(source_bytes, superclass) - if dotted: - resolved = self._resolve_alias_dotted(dotted, alias_stack[-1]) - record_inherits(resolved, self._node_start_line(node)) - - if node.type == "variable_declarator": - name_node = node.child_by_field_name("name") - value_node = node.child_by_field_name("value") - if ( - name_node is not None - and value_node is not None - and name_node.type in {"identifier", "property_identifier"} - and value_node.type == "arrow_function" - ): - scope_name = self._node_text(source_bytes, name_node).strip() - if scope_name: - scope_stack.append(scope_name) - pushed_scope = True - alias_stack.append(dict(alias_stack[-1])) - pushed_aliases = True - - if node.type == "method_definition" and self._has_class_ancestor(node): - name_node = node.child_by_field_name("name") - if name_node is not None: - scope_name = self._node_text(source_bytes, name_node).strip() - if scope_name and scope_name != "constructor": - scope_stack.append(scope_name) - pushed_scope = True - alias_stack.append(dict(alias_stack[-1])) - pushed_aliases = True - - if node.type in {"import_declaration", "import_statement"}: - updates, imported_targets = self._js_import_aliases_and_targets(source_bytes, node) - if updates: - alias_stack[-1].update(updates) - for target_symbol in imported_targets: - record_import(target_symbol, self._node_start_line(node)) - - # Best-effort support for CommonJS require() imports: - # const fs = require("fs") - if node.type == "variable_declarator": - name_node = node.child_by_field_name("name") - value_node = node.child_by_field_name("value") - if ( - name_node is not None - and value_node is not None - and name_node.type == "identifier" - and value_node.type == "call_expression" - ): - 
callee = value_node.child_by_field_name("function") - args = value_node.child_by_field_name("arguments") - if ( - callee is not None - and self._node_text(source_bytes, callee).strip() == "require" - and args is not None - ): - module_name = self._js_first_string_argument(source_bytes, args) - if module_name: - alias_stack[-1][self._node_text(source_bytes, name_node).strip()] = module_name - record_import(module_name, self._node_start_line(node)) - - if node.type == "call_expression": - fn_node = node.child_by_field_name("function") - if fn_node is not None: - dotted = self._js_expression_to_dotted(source_bytes, fn_node) - if dotted: - resolved = self._resolve_alias_dotted(dotted, alias_stack[-1]) - record_call(resolved, self._node_start_line(node)) - - for child in node.children: - visit(child) - - if pushed_aliases: - alias_stack.pop() - if pushed_scope: - scope_stack.pop() - - visit(root) - return relationships - - def _node_start_line(self, node: TreeSitterNode) -> int: - return node.start_point[0] + 1 - - def _resolve_alias_dotted(self, dotted: str, aliases: Dict[str, str]) -> str: - dotted = (dotted or "").strip() - if not dotted: - return "" - - base, sep, rest = dotted.partition(".") - resolved_base = aliases.get(base, base) - if not rest: - return resolved_base - if resolved_base and rest: - return f"{resolved_base}.{rest}" - return resolved_base - - def _python_expression_to_dotted(self, source_bytes: bytes, node: TreeSitterNode) -> str: - if node.type in {"identifier", "dotted_name"}: - return self._node_text(source_bytes, node).strip() - if node.type == "attribute": - obj = node.child_by_field_name("object") - attr = node.child_by_field_name("attribute") - obj_text = self._python_expression_to_dotted(source_bytes, obj) if obj is not None else "" - attr_text = self._node_text(source_bytes, attr).strip() if attr is not None else "" - if obj_text and attr_text: - return f"{obj_text}.{attr_text}" - return obj_text or attr_text - return "" - - def 
_python_import_aliases_and_targets( - self, - source_bytes: bytes, - node: TreeSitterNode, - ) -> tuple[Dict[str, str], List[str]]: - aliases: Dict[str, str] = {} - targets: List[str] = [] - - if node.type == "import_statement": - for child in node.children: - if child.type == "aliased_import": - name_node = child.child_by_field_name("name") - alias_node = child.child_by_field_name("alias") - if name_node is None: - continue - module_name = self._node_text(source_bytes, name_node).strip() - if not module_name: - continue - bound_name = ( - self._node_text(source_bytes, alias_node).strip() - if alias_node is not None - else module_name.split(".", 1)[0] - ) - if bound_name: - aliases[bound_name] = module_name - targets.append(module_name) - elif child.type == "dotted_name": - module_name = self._node_text(source_bytes, child).strip() - if not module_name: - continue - bound_name = module_name.split(".", 1)[0] - if bound_name: - aliases[bound_name] = bound_name - targets.append(module_name) - - if node.type == "import_from_statement": - module_name = "" - module_node = node.child_by_field_name("module_name") - if module_node is None: - for child in node.children: - if child.type == "dotted_name": - module_node = child - break - if module_node is not None: - module_name = self._node_text(source_bytes, module_node).strip() - - for child in node.children: - if child.type == "aliased_import": - name_node = child.child_by_field_name("name") - alias_node = child.child_by_field_name("alias") - if name_node is None: - continue - imported_name = self._node_text(source_bytes, name_node).strip() - if not imported_name or imported_name == "*": - continue - target = f"{module_name}.{imported_name}" if module_name else imported_name - bound_name = ( - self._node_text(source_bytes, alias_node).strip() - if alias_node is not None - else imported_name - ) - if bound_name: - aliases[bound_name] = target - targets.append(target) - elif child.type == "identifier": - imported_name = 
self._node_text(source_bytes, child).strip() - if not imported_name or imported_name in {"from", "import", "*"}: - continue - target = f"{module_name}.{imported_name}" if module_name else imported_name - aliases[imported_name] = target - targets.append(target) - - return aliases, targets - - def _js_expression_to_dotted(self, source_bytes: bytes, node: TreeSitterNode) -> str: - if node.type in {"this", "super"}: - return node.type - if node.type in {"identifier", "property_identifier"}: - return self._node_text(source_bytes, node).strip() - if node.type == "member_expression": - obj = node.child_by_field_name("object") - prop = node.child_by_field_name("property") - obj_text = self._js_expression_to_dotted(source_bytes, obj) if obj is not None else "" - prop_text = self._js_expression_to_dotted(source_bytes, prop) if prop is not None else "" - if obj_text and prop_text: - return f"{obj_text}.{prop_text}" - return obj_text or prop_text - return "" - - def _js_import_aliases_and_targets( - self, - source_bytes: bytes, - node: TreeSitterNode, - ) -> tuple[Dict[str, str], List[str]]: - aliases: Dict[str, str] = {} - targets: List[str] = [] - - module_name = "" - source_node = node.child_by_field_name("source") - if source_node is not None: - module_name = self._node_text(source_bytes, source_node).strip().strip("\"'").strip() - if module_name: - targets.append(module_name) - - for child in node.children: - if child.type == "import_clause": - for clause_child in child.children: - if clause_child.type == "identifier": - # Default import: import React from "react" - local = self._node_text(source_bytes, clause_child).strip() - if local and module_name: - aliases[local] = module_name - if clause_child.type == "namespace_import": - # Namespace import: import * as fs from "fs" - name_node = clause_child.child_by_field_name("name") - if name_node is not None and module_name: - local = self._node_text(source_bytes, name_node).strip() - if local: - aliases[local] = module_name 
- if clause_child.type == "named_imports": - for spec in clause_child.children: - if spec.type != "import_specifier": - continue - name_node = spec.child_by_field_name("name") - alias_node = spec.child_by_field_name("alias") - if name_node is None: - continue - imported = self._node_text(source_bytes, name_node).strip() - if not imported: - continue - local = ( - self._node_text(source_bytes, alias_node).strip() - if alias_node is not None - else imported - ) - if local and module_name: - aliases[local] = f"{module_name}.{imported}" - targets.append(f"{module_name}.{imported}") - - return aliases, targets - - def _js_first_string_argument(self, source_bytes: bytes, args_node: TreeSitterNode) -> str: - for child in args_node.children: - if child.type == "string": - return self._node_text(source_bytes, child).strip().strip("\"'").strip() - return "" - - def _extract_python_symbols(self, source_bytes: bytes, root: TreeSitterNode) -> List[Symbol]: - """Extract Python symbols from AST. - - Args: - source_bytes: Source code as bytes - root: Root AST node - - Returns: - List of Python symbols (classes, functions, methods) - """ - symbols: List[Symbol] = [] - - for node in self._iter_nodes(root): - if node.type == "class_definition": - name_node = node.child_by_field_name("name") - if name_node is None: - continue - symbols.append(Symbol( - name=self._node_text(source_bytes, name_node), - kind="class", - range=self._node_range(node), - )) - elif node.type in {"function_definition", "async_function_definition"}: - name_node = node.child_by_field_name("name") - if name_node is None: - continue - symbols.append(Symbol( - name=self._node_text(source_bytes, name_node), - kind=self._python_function_kind(node), - range=self._node_range(node), - )) - - return symbols - - def _extract_js_ts_symbols(self, source_bytes: bytes, root: TreeSitterNode) -> List[Symbol]: - """Extract JavaScript/TypeScript symbols from AST. 
- - Args: - source_bytes: Source code as bytes - root: Root AST node - - Returns: - List of JS/TS symbols (classes, functions, methods) - """ - symbols: List[Symbol] = [] - - for node in self._iter_nodes(root): - if node.type in {"class_declaration", "class"}: - name_node = node.child_by_field_name("name") - if name_node is None: - continue - symbols.append(Symbol( - name=self._node_text(source_bytes, name_node), - kind="class", - range=self._node_range(node), - )) - elif node.type in {"function_declaration", "generator_function_declaration"}: - name_node = node.child_by_field_name("name") - if name_node is None: - continue - symbols.append(Symbol( - name=self._node_text(source_bytes, name_node), - kind="function", - range=self._node_range(node), - )) - elif node.type == "variable_declarator": - name_node = node.child_by_field_name("name") - value_node = node.child_by_field_name("value") - if ( - name_node is None - or value_node is None - or name_node.type not in {"identifier", "property_identifier"} - or value_node.type != "arrow_function" - ): - continue - symbols.append(Symbol( - name=self._node_text(source_bytes, name_node), - kind="function", - range=self._node_range(node), - )) - elif node.type == "method_definition" and self._has_class_ancestor(node): - name_node = node.child_by_field_name("name") - if name_node is None: - continue - name = self._node_text(source_bytes, name_node) - if name == "constructor": - continue - symbols.append(Symbol( - name=name, - kind="method", - range=self._node_range(node), - )) - - return symbols - - def _python_function_kind(self, node: TreeSitterNode) -> str: - """Determine if Python function is a method or standalone function. 
- - Args: - node: Function definition node - - Returns: - 'method' if inside a class, 'function' otherwise - """ - parent = node.parent - while parent is not None: - if parent.type in {"function_definition", "async_function_definition"}: - return "function" - if parent.type == "class_definition": - return "method" - parent = parent.parent - return "function" - - def _has_class_ancestor(self, node: TreeSitterNode) -> bool: - """Check if node has a class ancestor. - - Args: - node: AST node to check - - Returns: - True if node is inside a class - """ - parent = node.parent - while parent is not None: - if parent.type in {"class_declaration", "class"}: - return True - parent = parent.parent - return False - - def _iter_nodes(self, root: TreeSitterNode): - """Iterate over all nodes in AST. - - Args: - root: Root node to start iteration - - Yields: - AST nodes in depth-first order - """ - stack = [root] - while stack: - node = stack.pop() - yield node - for child in reversed(node.children): - stack.append(child) - - def _node_text(self, source_bytes: bytes, node: TreeSitterNode) -> str: - """Extract text for a node. - - Args: - source_bytes: Source code as bytes - node: AST node - - Returns: - Text content of node - """ - return source_bytes[node.start_byte:node.end_byte].decode("utf8") - - def _node_range(self, node: TreeSitterNode) -> tuple[int, int]: - """Get line range for a node. - - Args: - node: AST node - - Returns: - (start_line, end_line) tuple, 1-based inclusive - """ - start_line = node.start_point[0] + 1 - end_line = node.end_point[0] + 1 - return (start_line, max(start_line, end_line)) - - def count_tokens(self, text: str) -> int: - """Count tokens in text. 
- - Args: - text: Text to count tokens for - - Returns: - Token count - """ - return self._tokenizer.count_tokens(text) diff --git a/codex-lens/build/lib/codexlens/search/__init__.py b/codex-lens/build/lib/codexlens/search/__init__.py deleted file mode 100644 index 46c660f4..00000000 --- a/codex-lens/build/lib/codexlens/search/__init__.py +++ /dev/null @@ -1,53 +0,0 @@ -from .chain_search import ( - ChainSearchEngine, - SearchOptions, - SearchStats, - ChainSearchResult, - quick_search, -) - -# Clustering availability flag (lazy import pattern) -CLUSTERING_AVAILABLE = False -_clustering_import_error: str | None = None - -try: - from .clustering import CLUSTERING_AVAILABLE as _clustering_flag - from .clustering import check_clustering_available - CLUSTERING_AVAILABLE = _clustering_flag -except ImportError as e: - _clustering_import_error = str(e) - - def check_clustering_available() -> tuple[bool, str | None]: - """Fallback when clustering module not loadable.""" - return False, _clustering_import_error - - -# Clustering module exports (conditional) -try: - from .clustering import ( - BaseClusteringStrategy, - ClusteringConfig, - ClusteringStrategyFactory, - get_strategy, - ) - _clustering_exports = [ - "BaseClusteringStrategy", - "ClusteringConfig", - "ClusteringStrategyFactory", - "get_strategy", - ] -except ImportError: - _clustering_exports = [] - - -__all__ = [ - "ChainSearchEngine", - "SearchOptions", - "SearchStats", - "ChainSearchResult", - "quick_search", - # Clustering - "CLUSTERING_AVAILABLE", - "check_clustering_available", - *_clustering_exports, -] diff --git a/codex-lens/build/lib/codexlens/search/association_tree/__init__.py b/codex-lens/build/lib/codexlens/search/association_tree/__init__.py deleted file mode 100644 index 9557af33..00000000 --- a/codex-lens/build/lib/codexlens/search/association_tree/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -"""Association tree module for LSP-based code relationship discovery. 
- -This module provides components for building and processing call association trees -using Language Server Protocol (LSP) call hierarchy capabilities. -""" - -from .builder import AssociationTreeBuilder -from .data_structures import ( - CallTree, - TreeNode, - UniqueNode, -) -from .deduplicator import ResultDeduplicator - -__all__ = [ - "AssociationTreeBuilder", - "CallTree", - "TreeNode", - "UniqueNode", - "ResultDeduplicator", -] diff --git a/codex-lens/build/lib/codexlens/search/association_tree/builder.py b/codex-lens/build/lib/codexlens/search/association_tree/builder.py deleted file mode 100644 index 894a8e20..00000000 --- a/codex-lens/build/lib/codexlens/search/association_tree/builder.py +++ /dev/null @@ -1,450 +0,0 @@ -"""Association tree builder using LSP call hierarchy. - -Builds call relationship trees by recursively expanding from seed locations -using Language Server Protocol (LSP) call hierarchy capabilities. -""" - -from __future__ import annotations - -import asyncio -import logging -from pathlib import Path -from typing import Dict, List, Optional, Set - -from codexlens.hybrid_search.data_structures import CallHierarchyItem, Range -from codexlens.lsp.standalone_manager import StandaloneLspManager -from .data_structures import CallTree, TreeNode - -logger = logging.getLogger(__name__) - - -class AssociationTreeBuilder: - """Builds association trees from seed locations using LSP call hierarchy. - - Uses depth-first recursive expansion to build a tree of code relationships - starting from seed locations (typically from vector search results). 
- - Strategy: - - Start from seed locations (vector search results) - - For each seed, get call hierarchy items via LSP - - Recursively expand incoming calls (callers) if expand_callers=True - - Recursively expand outgoing calls (callees) if expand_callees=True - - Track visited nodes to prevent cycles - - Stop at max_depth or when no more relations found - - Attributes: - lsp_manager: StandaloneLspManager for LSP communication - visited: Set of visited node IDs to prevent cycles - timeout: Timeout for individual LSP requests (seconds) - """ - - def __init__( - self, - lsp_manager: StandaloneLspManager, - timeout: float = 5.0, - analysis_wait: float = 2.0, - ): - """Initialize AssociationTreeBuilder. - - Args: - lsp_manager: StandaloneLspManager instance for LSP communication - timeout: Timeout for individual LSP requests in seconds - analysis_wait: Time to wait for LSP analysis on first file (seconds) - """ - self.lsp_manager = lsp_manager - self.timeout = timeout - self.analysis_wait = analysis_wait - self.visited: Set[str] = set() - self._analyzed_files: Set[str] = set() # Track files already analyzed - - async def build_tree( - self, - seed_file_path: str, - seed_line: int, - seed_character: int = 1, - max_depth: int = 5, - expand_callers: bool = True, - expand_callees: bool = True, - ) -> CallTree: - """Build call tree from a single seed location. 
- - Args: - seed_file_path: Path to the seed file - seed_line: Line number of the seed symbol (1-based) - seed_character: Character position (1-based, default 1) - max_depth: Maximum recursion depth (default 5) - expand_callers: Whether to expand incoming calls (callers) - expand_callees: Whether to expand outgoing calls (callees) - - Returns: - CallTree containing all discovered nodes and relationships - """ - tree = CallTree() - self.visited.clear() - - # Determine wait time - only wait for analysis on first encounter of file - wait_time = 0.0 - if seed_file_path not in self._analyzed_files: - wait_time = self.analysis_wait - self._analyzed_files.add(seed_file_path) - - # Get call hierarchy items for the seed position - try: - hierarchy_items = await asyncio.wait_for( - self.lsp_manager.get_call_hierarchy_items( - file_path=seed_file_path, - line=seed_line, - character=seed_character, - wait_for_analysis=wait_time, - ), - timeout=self.timeout + wait_time, - ) - except asyncio.TimeoutError: - logger.warning( - "Timeout getting call hierarchy items for %s:%d", - seed_file_path, - seed_line, - ) - return tree - except Exception as e: - logger.error( - "Error getting call hierarchy items for %s:%d: %s", - seed_file_path, - seed_line, - e, - ) - return tree - - if not hierarchy_items: - logger.debug( - "No call hierarchy items found for %s:%d", - seed_file_path, - seed_line, - ) - return tree - - # Create root nodes from hierarchy items - for item_dict in hierarchy_items: - # Convert LSP dict to CallHierarchyItem - item = self._dict_to_call_hierarchy_item(item_dict) - if not item: - continue - - root_node = TreeNode( - item=item, - depth=0, - path_from_root=[self._create_node_id(item)], - ) - tree.roots.append(root_node) - tree.add_node(root_node) - - # Mark as visited - self.visited.add(root_node.node_id) - - # Recursively expand the tree - await self._expand_node( - node=root_node, - node_dict=item_dict, - tree=tree, - current_depth=0, - max_depth=max_depth, - 
expand_callers=expand_callers, - expand_callees=expand_callees, - ) - - tree.depth_reached = max_depth - return tree - - async def _expand_node( - self, - node: TreeNode, - node_dict: Dict, - tree: CallTree, - current_depth: int, - max_depth: int, - expand_callers: bool, - expand_callees: bool, - ) -> None: - """Recursively expand a node by fetching its callers and callees. - - Args: - node: TreeNode to expand - node_dict: LSP CallHierarchyItem dict (for LSP requests) - tree: CallTree to add discovered nodes to - current_depth: Current recursion depth - max_depth: Maximum allowed depth - expand_callers: Whether to expand incoming calls - expand_callees: Whether to expand outgoing calls - """ - # Stop if max depth reached - if current_depth >= max_depth: - return - - # Prepare tasks for parallel expansion - tasks = [] - - if expand_callers: - tasks.append( - self._expand_incoming_calls( - node=node, - node_dict=node_dict, - tree=tree, - current_depth=current_depth, - max_depth=max_depth, - expand_callers=expand_callers, - expand_callees=expand_callees, - ) - ) - - if expand_callees: - tasks.append( - self._expand_outgoing_calls( - node=node, - node_dict=node_dict, - tree=tree, - current_depth=current_depth, - max_depth=max_depth, - expand_callers=expand_callers, - expand_callees=expand_callees, - ) - ) - - # Execute expansions in parallel - if tasks: - await asyncio.gather(*tasks, return_exceptions=True) - - async def _expand_incoming_calls( - self, - node: TreeNode, - node_dict: Dict, - tree: CallTree, - current_depth: int, - max_depth: int, - expand_callers: bool, - expand_callees: bool, - ) -> None: - """Expand incoming calls (callers) for a node. 
- - Args: - node: TreeNode being expanded - node_dict: LSP dict for the node - tree: CallTree to add nodes to - current_depth: Current depth - max_depth: Maximum depth - expand_callers: Whether to continue expanding callers - expand_callees: Whether to expand callees - """ - try: - incoming_calls = await asyncio.wait_for( - self.lsp_manager.get_incoming_calls(item=node_dict), - timeout=self.timeout, - ) - except asyncio.TimeoutError: - logger.debug("Timeout getting incoming calls for %s", node.node_id) - return - except Exception as e: - logger.debug("Error getting incoming calls for %s: %s", node.node_id, e) - return - - if not incoming_calls: - return - - # Process each incoming call - for call_dict in incoming_calls: - caller_dict = call_dict.get("from") - if not caller_dict: - continue - - # Convert to CallHierarchyItem - caller_item = self._dict_to_call_hierarchy_item(caller_dict) - if not caller_item: - continue - - caller_id = self._create_node_id(caller_item) - - # Check for cycles - if caller_id in self.visited: - # Create cycle marker node - cycle_node = TreeNode( - item=caller_item, - depth=current_depth + 1, - is_cycle=True, - path_from_root=node.path_from_root + [caller_id], - ) - node.parents.append(cycle_node) - continue - - # Create new caller node - caller_node = TreeNode( - item=caller_item, - depth=current_depth + 1, - path_from_root=node.path_from_root + [caller_id], - ) - - # Add to tree - tree.add_node(caller_node) - tree.add_edge(caller_node, node) - - # Update relationships - node.parents.append(caller_node) - caller_node.children.append(node) - - # Mark as visited - self.visited.add(caller_id) - - # Recursively expand the caller - await self._expand_node( - node=caller_node, - node_dict=caller_dict, - tree=tree, - current_depth=current_depth + 1, - max_depth=max_depth, - expand_callers=expand_callers, - expand_callees=expand_callees, - ) - - async def _expand_outgoing_calls( - self, - node: TreeNode, - node_dict: Dict, - tree: CallTree, - 
current_depth: int, - max_depth: int, - expand_callers: bool, - expand_callees: bool, - ) -> None: - """Expand outgoing calls (callees) for a node. - - Args: - node: TreeNode being expanded - node_dict: LSP dict for the node - tree: CallTree to add nodes to - current_depth: Current depth - max_depth: Maximum depth - expand_callers: Whether to expand callers - expand_callees: Whether to continue expanding callees - """ - try: - outgoing_calls = await asyncio.wait_for( - self.lsp_manager.get_outgoing_calls(item=node_dict), - timeout=self.timeout, - ) - except asyncio.TimeoutError: - logger.debug("Timeout getting outgoing calls for %s", node.node_id) - return - except Exception as e: - logger.debug("Error getting outgoing calls for %s: %s", node.node_id, e) - return - - if not outgoing_calls: - return - - # Process each outgoing call - for call_dict in outgoing_calls: - callee_dict = call_dict.get("to") - if not callee_dict: - continue - - # Convert to CallHierarchyItem - callee_item = self._dict_to_call_hierarchy_item(callee_dict) - if not callee_item: - continue - - callee_id = self._create_node_id(callee_item) - - # Check for cycles - if callee_id in self.visited: - # Create cycle marker node - cycle_node = TreeNode( - item=callee_item, - depth=current_depth + 1, - is_cycle=True, - path_from_root=node.path_from_root + [callee_id], - ) - node.children.append(cycle_node) - continue - - # Create new callee node - callee_node = TreeNode( - item=callee_item, - depth=current_depth + 1, - path_from_root=node.path_from_root + [callee_id], - ) - - # Add to tree - tree.add_node(callee_node) - tree.add_edge(node, callee_node) - - # Update relationships - node.children.append(callee_node) - callee_node.parents.append(node) - - # Mark as visited - self.visited.add(callee_id) - - # Recursively expand the callee - await self._expand_node( - node=callee_node, - node_dict=callee_dict, - tree=tree, - current_depth=current_depth + 1, - max_depth=max_depth, - 
expand_callers=expand_callers, - expand_callees=expand_callees, - ) - - def _dict_to_call_hierarchy_item( - self, item_dict: Dict - ) -> Optional[CallHierarchyItem]: - """Convert LSP dict to CallHierarchyItem. - - Args: - item_dict: LSP CallHierarchyItem dictionary - - Returns: - CallHierarchyItem or None if conversion fails - """ - try: - # Extract URI and convert to file path - uri = item_dict.get("uri", "") - file_path = uri.replace("file:///", "").replace("file://", "") - - # Handle Windows paths (file:///C:/...) - if len(file_path) > 2 and file_path[0] == "/" and file_path[2] == ":": - file_path = file_path[1:] - - # Extract range - range_dict = item_dict.get("range", {}) - start = range_dict.get("start", {}) - end = range_dict.get("end", {}) - - # Create Range (convert from 0-based to 1-based) - item_range = Range( - start_line=start.get("line", 0) + 1, - start_character=start.get("character", 0) + 1, - end_line=end.get("line", 0) + 1, - end_character=end.get("character", 0) + 1, - ) - - return CallHierarchyItem( - name=item_dict.get("name", "unknown"), - kind=str(item_dict.get("kind", "unknown")), - file_path=file_path, - range=item_range, - detail=item_dict.get("detail"), - ) - - except Exception as e: - logger.debug("Failed to convert dict to CallHierarchyItem: %s", e) - return None - - def _create_node_id(self, item: CallHierarchyItem) -> str: - """Create unique node ID from CallHierarchyItem. - - Args: - item: CallHierarchyItem - - Returns: - Unique node ID string - """ - return f"{item.file_path}:{item.name}:{item.range.start_line}" diff --git a/codex-lens/build/lib/codexlens/search/association_tree/data_structures.py b/codex-lens/build/lib/codexlens/search/association_tree/data_structures.py deleted file mode 100644 index 2c8b47fa..00000000 --- a/codex-lens/build/lib/codexlens/search/association_tree/data_structures.py +++ /dev/null @@ -1,191 +0,0 @@ -"""Data structures for association tree building. 
- -Defines the core data classes for representing call hierarchy trees and -deduplicated results. -""" - -from __future__ import annotations - -from dataclasses import dataclass, field -from typing import Any, Dict, List, Optional - -from codexlens.hybrid_search.data_structures import CallHierarchyItem, Range - - -@dataclass -class TreeNode: - """Node in the call association tree. - - Represents a single function/method in the tree, including its position - in the hierarchy and relationships. - - Attributes: - item: LSP CallHierarchyItem containing symbol information - depth: Distance from the root node (seed) - 0 for roots - children: List of child nodes (functions called by this node) - parents: List of parent nodes (functions that call this node) - is_cycle: Whether this node creates a circular reference - path_from_root: Path (list of node IDs) from root to this node - """ - - item: CallHierarchyItem - depth: int = 0 - children: List[TreeNode] = field(default_factory=list) - parents: List[TreeNode] = field(default_factory=list) - is_cycle: bool = False - path_from_root: List[str] = field(default_factory=list) - - @property - def node_id(self) -> str: - """Unique identifier for this node.""" - return f"{self.item.file_path}:{self.item.name}:{self.item.range.start_line}" - - def __hash__(self) -> int: - """Hash based on node ID.""" - return hash(self.node_id) - - def __eq__(self, other: object) -> bool: - """Equality based on node ID.""" - if not isinstance(other, TreeNode): - return False - return self.node_id == other.node_id - - def __repr__(self) -> str: - """String representation of the node.""" - cycle_marker = " [CYCLE]" if self.is_cycle else "" - return f"TreeNode({self.item.name}@{self.item.file_path}:{self.item.range.start_line}){cycle_marker}" - - -@dataclass -class CallTree: - """Complete call tree structure built from seeds. - - Contains all nodes discovered through recursive expansion and - the relationships between them. 
- - Attributes: - roots: List of root nodes (seed symbols) - all_nodes: Dictionary mapping node_id -> TreeNode for quick lookup - node_list: Flat list of all nodes in tree order - edges: List of (from_node_id, to_node_id) tuples representing calls - depth_reached: Maximum depth achieved in expansion - """ - - roots: List[TreeNode] = field(default_factory=list) - all_nodes: Dict[str, TreeNode] = field(default_factory=dict) - node_list: List[TreeNode] = field(default_factory=list) - edges: List[tuple[str, str]] = field(default_factory=list) - depth_reached: int = 0 - - def add_node(self, node: TreeNode) -> None: - """Add a node to the tree. - - Args: - node: TreeNode to add - """ - if node.node_id not in self.all_nodes: - self.all_nodes[node.node_id] = node - self.node_list.append(node) - - def add_edge(self, from_node: TreeNode, to_node: TreeNode) -> None: - """Add an edge between two nodes. - - Args: - from_node: Source node - to_node: Target node - """ - edge = (from_node.node_id, to_node.node_id) - if edge not in self.edges: - self.edges.append(edge) - - def get_node(self, node_id: str) -> Optional[TreeNode]: - """Get a node by ID. - - Args: - node_id: Node identifier - - Returns: - TreeNode if found, None otherwise - """ - return self.all_nodes.get(node_id) - - def __len__(self) -> int: - """Return total number of nodes in tree.""" - return len(self.all_nodes) - - def __repr__(self) -> str: - """String representation of the tree.""" - return ( - f"CallTree(roots={len(self.roots)}, nodes={len(self.all_nodes)}, " - f"depth={self.depth_reached})" - ) - - -@dataclass -class UniqueNode: - """Deduplicated unique code symbol from the tree. - - Represents a single unique code location that may appear multiple times - in the tree under different contexts. Contains aggregated information - about all occurrences. - - Attributes: - file_path: Absolute path to the file - name: Symbol name (function, method, class, etc.) - kind: Symbol kind (function, method, class, etc.) 
- range: Code range in the file - min_depth: Minimum depth at which this node appears in the tree - occurrences: Number of times this node appears in the tree - paths: List of paths from roots to this node - context_nodes: Related nodes from the tree - score: Composite relevance score (higher is better) - """ - - file_path: str - name: str - kind: str - range: Range - min_depth: int = 0 - occurrences: int = 1 - paths: List[List[str]] = field(default_factory=list) - context_nodes: List[str] = field(default_factory=list) - score: float = 0.0 - - @property - def node_key(self) -> tuple[str, int, int]: - """Unique key for deduplication. - - Uses (file_path, start_line, end_line) as the unique identifier - for this symbol across all occurrences. - """ - return ( - self.file_path, - self.range.start_line, - self.range.end_line, - ) - - def add_path(self, path: List[str]) -> None: - """Add a path from root to this node. - - Args: - path: List of node IDs from root to this node - """ - if path not in self.paths: - self.paths.append(path) - - def __hash__(self) -> int: - """Hash based on node key.""" - return hash(self.node_key) - - def __eq__(self, other: object) -> bool: - """Equality based on node key.""" - if not isinstance(other, UniqueNode): - return False - return self.node_key == other.node_key - - def __repr__(self) -> str: - """String representation of the unique node.""" - return ( - f"UniqueNode({self.name}@{self.file_path}:{self.range.start_line}, " - f"depth={self.min_depth}, occ={self.occurrences}, score={self.score:.2f})" - ) diff --git a/codex-lens/build/lib/codexlens/search/association_tree/deduplicator.py b/codex-lens/build/lib/codexlens/search/association_tree/deduplicator.py deleted file mode 100644 index 9e590518..00000000 --- a/codex-lens/build/lib/codexlens/search/association_tree/deduplicator.py +++ /dev/null @@ -1,301 +0,0 @@ -"""Result deduplication for association tree nodes. 
- -Provides functionality to extract unique nodes from a call tree and assign -relevance scores based on various factors. -""" - -from __future__ import annotations - -import logging -from typing import Dict, List, Optional - -from .data_structures import ( - CallTree, - TreeNode, - UniqueNode, -) - -logger = logging.getLogger(__name__) - - -# Symbol kind weights for scoring (higher = more relevant) -KIND_WEIGHTS: Dict[str, float] = { - # Functions and methods are primary targets - "function": 1.0, - "method": 1.0, - "12": 1.0, # LSP SymbolKind.Function - "6": 1.0, # LSP SymbolKind.Method - # Classes are important but secondary - "class": 0.8, - "5": 0.8, # LSP SymbolKind.Class - # Interfaces and types - "interface": 0.7, - "11": 0.7, # LSP SymbolKind.Interface - "type": 0.6, - # Constructors - "constructor": 0.9, - "9": 0.9, # LSP SymbolKind.Constructor - # Variables and constants - "variable": 0.4, - "13": 0.4, # LSP SymbolKind.Variable - "constant": 0.5, - "14": 0.5, # LSP SymbolKind.Constant - # Default for unknown kinds - "unknown": 0.3, -} - - -class ResultDeduplicator: - """Extracts and scores unique nodes from call trees. - - Processes a CallTree to extract unique code locations, merging duplicates - and assigning relevance scores based on: - - Depth: Shallower nodes (closer to seeds) score higher - - Frequency: Nodes appearing multiple times score higher - - Kind: Function/method > class > variable - - Attributes: - depth_weight: Weight for depth factor in scoring (default 0.4) - frequency_weight: Weight for frequency factor (default 0.3) - kind_weight: Weight for symbol kind factor (default 0.3) - max_depth_penalty: Maximum depth before full penalty applied - """ - - def __init__( - self, - depth_weight: float = 0.4, - frequency_weight: float = 0.3, - kind_weight: float = 0.3, - max_depth_penalty: int = 10, - ): - """Initialize ResultDeduplicator. 
- - Args: - depth_weight: Weight for depth factor (0.0-1.0) - frequency_weight: Weight for frequency factor (0.0-1.0) - kind_weight: Weight for symbol kind factor (0.0-1.0) - max_depth_penalty: Depth at which score becomes 0 for depth factor - """ - self.depth_weight = depth_weight - self.frequency_weight = frequency_weight - self.kind_weight = kind_weight - self.max_depth_penalty = max_depth_penalty - - def deduplicate( - self, - tree: CallTree, - max_results: Optional[int] = None, - ) -> List[UniqueNode]: - """Extract unique nodes from the call tree. - - Traverses the tree, groups nodes by their unique key (file_path, - start_line, end_line), and merges duplicate occurrences. - - Args: - tree: CallTree to process - max_results: Maximum number of results to return (None = all) - - Returns: - List of UniqueNode objects, sorted by score descending - """ - if not tree.node_list: - return [] - - # Group nodes by unique key - unique_map: Dict[tuple, UniqueNode] = {} - - for node in tree.node_list: - if node.is_cycle: - # Skip cycle markers - they point to already-counted nodes - continue - - key = self._get_node_key(node) - - if key in unique_map: - # Update existing unique node - unique_node = unique_map[key] - unique_node.occurrences += 1 - unique_node.min_depth = min(unique_node.min_depth, node.depth) - unique_node.add_path(node.path_from_root) - - # Collect context from relationships - for parent in node.parents: - if not parent.is_cycle: - unique_node.context_nodes.append(parent.node_id) - for child in node.children: - if not child.is_cycle: - unique_node.context_nodes.append(child.node_id) - else: - # Create new unique node - unique_node = UniqueNode( - file_path=node.item.file_path, - name=node.item.name, - kind=node.item.kind, - range=node.item.range, - min_depth=node.depth, - occurrences=1, - paths=[node.path_from_root.copy()], - context_nodes=[], - score=0.0, - ) - - # Collect initial context - for parent in node.parents: - if not parent.is_cycle: - 
unique_node.context_nodes.append(parent.node_id) - for child in node.children: - if not child.is_cycle: - unique_node.context_nodes.append(child.node_id) - - unique_map[key] = unique_node - - # Calculate scores for all unique nodes - unique_nodes = list(unique_map.values()) - - # Find max frequency for normalization - max_frequency = max((n.occurrences for n in unique_nodes), default=1) - - for node in unique_nodes: - node.score = self._score_node(node, max_frequency) - - # Sort by score descending - unique_nodes.sort(key=lambda n: n.score, reverse=True) - - # Apply max_results limit - if max_results is not None and max_results > 0: - unique_nodes = unique_nodes[:max_results] - - logger.debug( - "Deduplicated %d tree nodes to %d unique nodes", - len(tree.node_list), - len(unique_nodes), - ) - - return unique_nodes - - def _score_node( - self, - node: UniqueNode, - max_frequency: int, - ) -> float: - """Calculate composite score for a unique node. - - Score = depth_weight * depth_score + - frequency_weight * frequency_score + - kind_weight * kind_score - - Args: - node: UniqueNode to score - max_frequency: Maximum occurrence count for normalization - - Returns: - Composite score between 0.0 and 1.0 - """ - # Depth score: closer to root = higher score - # Score of 1.0 at depth 0, decreasing to 0.0 at max_depth_penalty - depth_score = max( - 0.0, - 1.0 - (node.min_depth / self.max_depth_penalty), - ) - - # Frequency score: more occurrences = higher score - frequency_score = node.occurrences / max_frequency if max_frequency > 0 else 0.0 - - # Kind score: function/method > class > variable - kind_str = str(node.kind).lower() - kind_score = KIND_WEIGHTS.get(kind_str, KIND_WEIGHTS["unknown"]) - - # Composite score - score = ( - self.depth_weight * depth_score - + self.frequency_weight * frequency_score - + self.kind_weight * kind_score - ) - - return score - - def _get_node_key(self, node: TreeNode) -> tuple: - """Get unique key for a tree node. 
- - Uses (file_path, start_line, end_line) as the unique identifier. - - Args: - node: TreeNode - - Returns: - Tuple key for deduplication - """ - return ( - node.item.file_path, - node.item.range.start_line, - node.item.range.end_line, - ) - - def filter_by_kind( - self, - nodes: List[UniqueNode], - kinds: List[str], - ) -> List[UniqueNode]: - """Filter unique nodes by symbol kind. - - Args: - nodes: List of UniqueNode to filter - kinds: List of allowed kinds (e.g., ["function", "method"]) - - Returns: - Filtered list of UniqueNode - """ - kinds_lower = [k.lower() for k in kinds] - return [ - node - for node in nodes - if str(node.kind).lower() in kinds_lower - ] - - def filter_by_file( - self, - nodes: List[UniqueNode], - file_patterns: List[str], - ) -> List[UniqueNode]: - """Filter unique nodes by file path patterns. - - Args: - nodes: List of UniqueNode to filter - file_patterns: List of path substrings to match - - Returns: - Filtered list of UniqueNode - """ - return [ - node - for node in nodes - if any(pattern in node.file_path for pattern in file_patterns) - ] - - def to_dict_list(self, nodes: List[UniqueNode]) -> List[Dict]: - """Convert list of UniqueNode to JSON-serializable dicts. 
- - Args: - nodes: List of UniqueNode - - Returns: - List of dictionaries - """ - return [ - { - "file_path": node.file_path, - "name": node.name, - "kind": node.kind, - "range": { - "start_line": node.range.start_line, - "start_character": node.range.start_character, - "end_line": node.range.end_line, - "end_character": node.range.end_character, - }, - "min_depth": node.min_depth, - "occurrences": node.occurrences, - "path_count": len(node.paths), - "score": round(node.score, 4), - } - for node in nodes - ] diff --git a/codex-lens/build/lib/codexlens/search/binary_searcher.py b/codex-lens/build/lib/codexlens/search/binary_searcher.py deleted file mode 100644 index c37256f1..00000000 --- a/codex-lens/build/lib/codexlens/search/binary_searcher.py +++ /dev/null @@ -1,277 +0,0 @@ -"""Binary vector searcher for cascade search. - -This module provides fast binary vector search using Hamming distance -for the first stage of cascade search (coarse filtering). - -Supports two loading modes: -1. Memory-mapped file (preferred): Low memory footprint, OS-managed paging -2. Database loading (fallback): Loads all vectors into RAM -""" - -from __future__ import annotations - -import json -import logging -from pathlib import Path -from typing import List, Optional, Tuple - -import numpy as np - -logger = logging.getLogger(__name__) - -# Pre-computed popcount lookup table for vectorized Hamming distance -# Each byte value (0-255) maps to its bit count -_POPCOUNT_TABLE = np.array([bin(i).count('1') for i in range(256)], dtype=np.uint8) - - -class BinarySearcher: - """Fast binary vector search using Hamming distance. - - This class implements the first stage of cascade search: - fast, approximate retrieval using binary vectors and Hamming distance. 
- - The binary vectors are derived from dense embeddings by thresholding: - binary[i] = 1 if dense[i] > 0 else 0 - - Hamming distance between two binary vectors counts the number of - differing bits, which can be computed very efficiently using XOR - and population count. - - Supports two loading modes: - - Memory-mapped file (preferred): Uses np.memmap for minimal RAM usage - - Database (fallback): Loads all vectors into memory from SQLite - """ - - def __init__(self, index_root_or_meta_path: Path) -> None: - """Initialize BinarySearcher. - - Args: - index_root_or_meta_path: Either: - - Path to index root directory (containing _binary_vectors.mmap) - - Path to _vectors_meta.db (legacy mode, loads from DB) - """ - path = Path(index_root_or_meta_path) - - # Determine if this is an index root or a specific DB path - if path.suffix == '.db': - # Legacy mode: specific DB path - self.index_root = path.parent - self.meta_store_path = path - else: - # New mode: index root directory - self.index_root = path - self.meta_store_path = path / "_vectors_meta.db" - - self._chunk_ids: Optional[np.ndarray] = None - self._binary_matrix: Optional[np.ndarray] = None - self._is_memmap = False - self._loaded = False - - def load(self) -> bool: - """Load binary vectors using memory-mapped file or database fallback. - - Tries to load from memory-mapped file first (preferred for large indexes), - falls back to database loading if mmap file doesn't exist. - - Returns: - True if vectors were loaded successfully. 
- """ - if self._loaded: - return True - - # Try memory-mapped file first (preferred) - mmap_path = self.index_root / "_binary_vectors.mmap" - meta_path = mmap_path.with_suffix('.meta.json') - - if mmap_path.exists() and meta_path.exists(): - try: - with open(meta_path, 'r') as f: - meta = json.load(f) - - shape = tuple(meta['shape']) - self._chunk_ids = np.array(meta['chunk_ids'], dtype=np.int64) - - # Memory-map the binary matrix (read-only) - self._binary_matrix = np.memmap( - str(mmap_path), - dtype=np.uint8, - mode='r', - shape=shape - ) - self._is_memmap = True - self._loaded = True - - logger.info( - "Memory-mapped %d binary vectors (%d bytes each)", - len(self._chunk_ids), shape[1] - ) - return True - - except Exception as e: - logger.warning("Failed to load mmap binary vectors, falling back to DB: %s", e) - - # Fallback: load from database - return self._load_from_db() - - def _load_from_db(self) -> bool: - """Load binary vectors from database (legacy/fallback mode). - - Returns: - True if vectors were loaded successfully. 
- """ - try: - from codexlens.storage.vector_meta_store import VectorMetadataStore - - with VectorMetadataStore(self.meta_store_path) as store: - rows = store.get_all_binary_vectors() - - if not rows: - logger.warning("No binary vectors found in %s", self.meta_store_path) - return False - - # Convert to numpy arrays for fast computation - self._chunk_ids = np.array([r[0] for r in rows], dtype=np.int64) - - # Unpack bytes to numpy array - binary_arrays = [] - for _, vec_bytes in rows: - arr = np.frombuffer(vec_bytes, dtype=np.uint8) - binary_arrays.append(arr) - - self._binary_matrix = np.vstack(binary_arrays) - self._is_memmap = False - self._loaded = True - - logger.info( - "Loaded %d binary vectors from DB (%d bytes each)", - len(self._chunk_ids), self._binary_matrix.shape[1] - ) - return True - - except Exception as e: - logger.error("Failed to load binary vectors: %s", e) - return False - - def search( - self, - query_vector: np.ndarray, - top_k: int = 100 - ) -> List[Tuple[int, int]]: - """Search for similar vectors using Hamming distance. - - Args: - query_vector: Dense query vector (will be binarized). - top_k: Number of top results to return. - - Returns: - List of (chunk_id, hamming_distance) tuples sorted by distance. 
- """ - if not self._loaded and not self.load(): - return [] - - # Binarize query vector - query_binary = (query_vector > 0).astype(np.uint8) - query_packed = np.packbits(query_binary) - - # Compute Hamming distances using XOR and popcount - # XOR gives 1 for differing bits - xor_result = np.bitwise_xor(self._binary_matrix, query_packed) - - # Vectorized popcount using lookup table (orders of magnitude faster) - # Sum the bit counts for each byte across all columns - distances = np.sum(_POPCOUNT_TABLE[xor_result], axis=1, dtype=np.int32) - - # Get top-k with smallest distances - if top_k >= len(distances): - top_indices = np.argsort(distances) - else: - # Partial sort for efficiency - top_indices = np.argpartition(distances, top_k)[:top_k] - top_indices = top_indices[np.argsort(distances[top_indices])] - - results = [ - (int(self._chunk_ids[i]), int(distances[i])) - for i in top_indices - ] - - return results - - def search_with_rerank( - self, - query_dense: np.ndarray, - dense_vectors: np.ndarray, - dense_chunk_ids: np.ndarray, - top_k: int = 10, - candidates: int = 100 - ) -> List[Tuple[int, float]]: - """Two-stage cascade search: binary filter + dense rerank. - - Args: - query_dense: Dense query vector. - dense_vectors: Dense vectors for reranking (from HNSW or stored). - dense_chunk_ids: Chunk IDs corresponding to dense_vectors. - top_k: Final number of results. - candidates: Number of candidates from binary search. - - Returns: - List of (chunk_id, cosine_similarity) tuples. 
- """ - # Stage 1: Binary filtering - binary_results = self.search(query_dense, top_k=candidates) - if not binary_results: - return [] - - candidate_ids = {r[0] for r in binary_results} - - # Stage 2: Dense reranking - # Find indices of candidates in dense_vectors - candidate_mask = np.isin(dense_chunk_ids, list(candidate_ids)) - candidate_indices = np.where(candidate_mask)[0] - - if len(candidate_indices) == 0: - # Fallback: return binary results with normalized distance - max_dist = max(r[1] for r in binary_results) if binary_results else 1 - return [(r[0], 1.0 - r[1] / max_dist) for r in binary_results[:top_k]] - - # Compute cosine similarities for candidates - candidate_vectors = dense_vectors[candidate_indices] - candidate_ids_array = dense_chunk_ids[candidate_indices] - - # Normalize vectors - query_norm = query_dense / (np.linalg.norm(query_dense) + 1e-8) - cand_norms = candidate_vectors / ( - np.linalg.norm(candidate_vectors, axis=1, keepdims=True) + 1e-8 - ) - - # Cosine similarities - similarities = np.dot(cand_norms, query_norm) - - # Sort by similarity (descending) - sorted_indices = np.argsort(-similarities)[:top_k] - - results = [ - (int(candidate_ids_array[i]), float(similarities[i])) - for i in sorted_indices - ] - - return results - - @property - def vector_count(self) -> int: - """Get number of loaded binary vectors.""" - return len(self._chunk_ids) if self._chunk_ids is not None else 0 - - @property - def is_memmap(self) -> bool: - """Check if using memory-mapped file (vs in-memory array).""" - return self._is_memmap - - def clear(self) -> None: - """Clear loaded vectors from memory.""" - # For memmap, just delete the reference (OS will handle cleanup) - if self._is_memmap and self._binary_matrix is not None: - del self._binary_matrix - self._chunk_ids = None - self._binary_matrix = None - self._is_memmap = False - self._loaded = False diff --git a/codex-lens/build/lib/codexlens/search/chain_search.py 
b/codex-lens/build/lib/codexlens/search/chain_search.py deleted file mode 100644 index 9090dbca..00000000 --- a/codex-lens/build/lib/codexlens/search/chain_search.py +++ /dev/null @@ -1,3268 +0,0 @@ -"""Chain search engine for recursive multi-directory searching. - -Provides parallel search across directory hierarchies using indexed _index.db files. -Supports depth-limited traversal, result aggregation, and symbol search. -""" - -from __future__ import annotations - -from concurrent.futures import ThreadPoolExecutor, as_completed -from dataclasses import dataclass, field -from pathlib import Path -from typing import List, Optional, Dict, Any, Literal, Tuple, TYPE_CHECKING -import json -import logging -import os -import time - -from codexlens.entities import SearchResult, Symbol - -if TYPE_CHECKING: - import numpy as np - -try: - import numpy as np - NUMPY_AVAILABLE = True -except ImportError: - NUMPY_AVAILABLE = False -from codexlens.config import Config -from codexlens.storage.registry import RegistryStore, DirMapping -from codexlens.storage.dir_index import DirIndexStore, SubdirLink -from codexlens.storage.global_index import GlobalSymbolIndex -from codexlens.storage.path_mapper import PathMapper -from codexlens.storage.sqlite_store import SQLiteStore -from codexlens.storage.vector_meta_store import VectorMetadataStore -from codexlens.config import VECTORS_META_DB_NAME -from codexlens.search.hybrid_search import HybridSearchEngine - - -@dataclass -class SearchOptions: - """Configuration options for chain search. 
- - Attributes: - depth: Maximum search depth (-1 = unlimited, 0 = current dir only) - max_workers: Number of parallel worker threads - limit_per_dir: Maximum results per directory - total_limit: Total result limit across all directories - offset: Pagination offset - skip first N results (default 0) - include_symbols: Whether to include symbol search results - files_only: Return only file paths without excerpts - include_semantic: Whether to include semantic keyword search results - code_only: Only return code files (excludes md, txt, json, yaml, xml, etc.) - exclude_extensions: List of file extensions to exclude (e.g., ["md", "txt", "json"]) - hybrid_mode: Enable hybrid search with RRF fusion (default False) - enable_fuzzy: Enable fuzzy FTS in hybrid mode (default True) - enable_vector: Enable vector semantic search (default False) - pure_vector: If True, only use vector search without FTS fallback (default False) - enable_splade: Enable SPLADE sparse neural search (default False) - enable_cascade: Enable cascade (binary+dense) two-stage retrieval (default False) - hybrid_weights: Custom RRF weights for hybrid search (optional) - group_results: Enable grouping of similar results (default False) - grouping_threshold: Score threshold for grouping similar results (default 0.01) - """ - depth: int = -1 - max_workers: int = 8 - limit_per_dir: int = 10 - total_limit: int = 100 - offset: int = 0 - include_symbols: bool = False - files_only: bool = False - include_semantic: bool = False - code_only: bool = False - exclude_extensions: Optional[List[str]] = None - hybrid_mode: bool = False - enable_fuzzy: bool = True - enable_vector: bool = False - pure_vector: bool = False - enable_splade: bool = False - enable_cascade: bool = False - hybrid_weights: Optional[Dict[str, float]] = None - group_results: bool = False - grouping_threshold: float = 0.01 - - -@dataclass -class SearchStats: - """Statistics collected during search execution. 
- - Attributes: - dirs_searched: Number of directories searched - files_matched: Number of files with matches - time_ms: Total search time in milliseconds - errors: List of error messages encountered - """ - dirs_searched: int = 0 - files_matched: int = 0 - time_ms: float = 0 - errors: List[str] = field(default_factory=list) - - -@dataclass -class ChainSearchResult: - """Comprehensive search result with metadata. - - Attributes: - query: Original search query - results: List of SearchResult objects - related_results: Expanded results from graph neighbors (optional) - symbols: List of Symbol objects (if include_symbols=True) - stats: SearchStats with execution metrics - """ - query: str - results: List[SearchResult] - symbols: List[Symbol] - stats: SearchStats - related_results: List[SearchResult] = field(default_factory=list) - - -@dataclass -class ReferenceResult: - """Result from reference search in code_relationships table. - - Attributes: - file_path: Path to the file containing the reference - line: Line number where the reference occurs (1-based) - column: Column number where the reference occurs (0-based) - context: Surrounding code snippet for context - relationship_type: Type of relationship (call, import, inheritance, etc.) - """ - file_path: str - line: int - column: int - context: str - relationship_type: str - - -class ChainSearchEngine: - """Parallel chain search engine for hierarchical directory indexes. - - Searches across multiple directory indexes in parallel, following subdirectory - links to recursively traverse the file tree. Supports depth limits, result - aggregation, and both content and symbol searches. - - Thread-safe with configurable parallelism. - - Attributes: - registry: Global project registry - mapper: Path mapping utility - logger: Python logger instance - """ - - def __init__(self, - registry: RegistryStore, - mapper: PathMapper, - max_workers: int = 8, - config: Config | None = None): - """Initialize chain search engine. 
- - Args: - registry: Global project registry for path lookups - mapper: Path mapper for source/index conversions - max_workers: Maximum parallel workers (default 8) - """ - self.registry = registry - self.mapper = mapper - self.logger = logging.getLogger(__name__) - self._max_workers = max_workers - self._executor: Optional[ThreadPoolExecutor] = None - self._config = config - - def _get_executor(self, max_workers: Optional[int] = None) -> ThreadPoolExecutor: - """Get or create the shared thread pool executor. - - Lazy initialization to avoid creating executor if never used. - - Args: - max_workers: Override default max_workers if specified - - Returns: - ThreadPoolExecutor instance - """ - workers = max_workers or self._max_workers - if self._executor is None: - self._executor = ThreadPoolExecutor(max_workers=workers) - return self._executor - - def close(self) -> None: - """Shutdown the thread pool executor.""" - if self._executor is not None: - self._executor.shutdown(wait=True) - self._executor = None - - def __enter__(self) -> "ChainSearchEngine": - """Context manager entry.""" - return self - - def __exit__(self, exc_type: object, exc: object, tb: object) -> None: - """Context manager exit.""" - self.close() - - def search(self, query: str, - source_path: Path, - options: Optional[SearchOptions] = None) -> ChainSearchResult: - """Execute chain search from source_path with recursive traversal. - - Process: - 1. Locate starting index for source_path - 2. Collect all child indexes based on depth limit - 3. Search indexes in parallel using ThreadPoolExecutor - 4. 
Aggregate, deduplicate, and rank results - - Args: - query: FTS5 search query string - source_path: Starting directory path - options: Search configuration (uses defaults if None) - - Returns: - ChainSearchResult with results, symbols, and statistics - - Examples: - >>> engine = ChainSearchEngine(registry, mapper) - >>> result = engine.search("authentication", Path("D:/project/src")) - >>> for r in result.results[:5]: - ... print(f"{r.path}: {r.score:.2f}") - """ - options = options or SearchOptions() - start_time = time.time() - stats = SearchStats() - - # Step 1: Find starting index - start_index = self._find_start_index(source_path) - if not start_index: - self.logger.warning(f"No index found for {source_path}") - stats.time_ms = (time.time() - start_time) * 1000 - return ChainSearchResult( - query=query, - results=[], - symbols=[], - stats=stats - ) - - # Step 2: Collect all index paths to search - index_paths = self._collect_index_paths(start_index, options.depth) - stats.dirs_searched = len(index_paths) - - if not index_paths: - self.logger.warning(f"No indexes collected from {start_index}") - stats.time_ms = (time.time() - start_time) * 1000 - return ChainSearchResult( - query=query, - results=[], - symbols=[], - stats=stats - ) - - # Step 3: Parallel search - results, search_stats = self._search_parallel( - index_paths, query, options - ) - stats.errors = search_stats.errors - - # Step 3.5: Filter by extension if requested - if options.code_only or options.exclude_extensions: - results = self._filter_by_extension( - results, options.code_only, options.exclude_extensions - ) - - # Step 4: Merge and rank - final_results = self._merge_and_rank(results, options.total_limit, options.offset) - - # Step 5: Optional grouping of similar results - if options.group_results: - from codexlens.search.ranking import group_similar_results - final_results = group_similar_results( - final_results, score_threshold_abs=options.grouping_threshold - ) - - stats.files_matched = 
len(final_results) - - # Optional: Symbol search - symbols = [] - if options.include_symbols: - symbols = self._search_symbols_parallel( - index_paths, query, None, options.total_limit - ) - - # Optional: graph expansion using precomputed neighbors - related_results: List[SearchResult] = [] - if self._config is not None and getattr(self._config, "enable_graph_expansion", False): - try: - from codexlens.search.enrichment import SearchEnrichmentPipeline - - pipeline = SearchEnrichmentPipeline(self.mapper, config=self._config) - related_results = pipeline.expand_related_results(final_results) - except Exception as exc: - self.logger.debug("Graph expansion failed: %s", exc) - related_results = [] - - stats.time_ms = (time.time() - start_time) * 1000 - - return ChainSearchResult( - query=query, - results=final_results, - symbols=symbols, - stats=stats, - related_results=related_results, - ) - - def hybrid_cascade_search( - self, - query: str, - source_path: Path, - k: int = 10, - coarse_k: int = 100, - options: Optional[SearchOptions] = None, - ) -> ChainSearchResult: - """Execute two-stage cascade search with hybrid coarse retrieval and cross-encoder reranking. - - Hybrid cascade search process: - 1. Stage 1 (Coarse): Fast retrieval using RRF fusion of FTS + SPLADE + Vector - to get coarse_k candidates - 2. Stage 2 (Fine): CrossEncoder reranking of candidates to get final k results - - This approach balances recall (from broad coarse search) with precision - (from expensive but accurate cross-encoder scoring). - - Note: This method is the original hybrid approach. For binary vector cascade, - use binary_cascade_search() instead. 
- - Args: - query: Natural language or keyword query string - source_path: Starting directory path - k: Number of final results to return (default 10) - coarse_k: Number of coarse candidates from first stage (default 100) - options: Search configuration (uses defaults if None) - - Returns: - ChainSearchResult with reranked results and statistics - - Examples: - >>> engine = ChainSearchEngine(registry, mapper, config=config) - >>> result = engine.hybrid_cascade_search( - ... "how to authenticate users", - ... Path("D:/project/src"), - ... k=10, - ... coarse_k=100 - ... ) - >>> for r in result.results: - ... print(f"{r.path}: {r.score:.3f}") - """ - options = options or SearchOptions() - start_time = time.time() - stats = SearchStats() - - # Use config defaults if available - if self._config is not None: - if hasattr(self._config, "cascade_coarse_k"): - coarse_k = coarse_k or self._config.cascade_coarse_k - if hasattr(self._config, "cascade_fine_k"): - k = k or self._config.cascade_fine_k - - # Step 1: Find starting index - start_index = self._find_start_index(source_path) - if not start_index: - self.logger.warning(f"No index found for {source_path}") - stats.time_ms = (time.time() - start_time) * 1000 - return ChainSearchResult( - query=query, - results=[], - symbols=[], - stats=stats - ) - - # Step 2: Collect all index paths - index_paths = self._collect_index_paths(start_index, options.depth) - stats.dirs_searched = len(index_paths) - - if not index_paths: - self.logger.warning(f"No indexes collected from {start_index}") - stats.time_ms = (time.time() - start_time) * 1000 - return ChainSearchResult( - query=query, - results=[], - symbols=[], - stats=stats - ) - - # Stage 1: Coarse retrieval with hybrid search (FTS + SPLADE + Vector) - # Use hybrid mode for multi-signal retrieval - coarse_options = SearchOptions( - depth=options.depth, - max_workers=1, # Single thread for GPU safety - limit_per_dir=max(coarse_k // len(index_paths), 20), - total_limit=coarse_k, - 
hybrid_mode=True, - enable_fuzzy=options.enable_fuzzy, - enable_vector=True, # Enable vector for semantic matching - pure_vector=False, - hybrid_weights=options.hybrid_weights, - ) - - self.logger.debug( - "Cascade Stage 1: Coarse retrieval for %d candidates", coarse_k - ) - coarse_results, search_stats = self._search_parallel( - index_paths, query, coarse_options - ) - stats.errors = search_stats.errors - - # Merge and deduplicate coarse results - coarse_merged = self._merge_and_rank(coarse_results, coarse_k) - self.logger.debug( - "Cascade Stage 1 complete: %d candidates retrieved", len(coarse_merged) - ) - - if not coarse_merged: - stats.time_ms = (time.time() - start_time) * 1000 - return ChainSearchResult( - query=query, - results=[], - symbols=[], - stats=stats - ) - - # Stage 2: Cross-encoder reranking - self.logger.debug( - "Cascade Stage 2: Cross-encoder reranking %d candidates to top-%d", - len(coarse_merged), - k, - ) - - final_results = self._cross_encoder_rerank(query, coarse_merged, k) - - # Optional: grouping of similar results - if options.group_results: - from codexlens.search.ranking import group_similar_results - final_results = group_similar_results( - final_results, score_threshold_abs=options.grouping_threshold - ) - - stats.files_matched = len(final_results) - stats.time_ms = (time.time() - start_time) * 1000 - - self.logger.debug( - "Cascade search complete: %d results in %.2fms", - len(final_results), - stats.time_ms, - ) - - return ChainSearchResult( - query=query, - results=final_results, - symbols=[], - stats=stats, - ) - - def binary_cascade_search( - self, - query: str, - source_path: Path, - k: int = 10, - coarse_k: int = 100, - options: Optional[SearchOptions] = None, - ) -> ChainSearchResult: - """Execute binary cascade search with binary coarse ranking and dense fine ranking. - - Binary cascade search process: - 1. 
Stage 1 (Coarse): Fast binary vector search using Hamming distance - to quickly filter to coarse_k candidates (256-dim binary, 32 bytes/vector) - 2. Stage 2 (Fine): Dense vector cosine similarity for precise reranking - of candidates (2048-dim float32) - - This approach leverages the speed of binary search (~100x faster) while - maintaining precision through dense vector reranking. - - Performance characteristics: - - Binary search: O(N) with SIMD-accelerated XOR + popcount - - Dense rerank: Only applied to top coarse_k candidates - - Memory: 32 bytes (binary) + 8KB (dense) per chunk - - Args: - query: Natural language or keyword query string - source_path: Starting directory path - k: Number of final results to return (default 10) - coarse_k: Number of coarse candidates from first stage (default 100) - options: Search configuration (uses defaults if None) - - Returns: - ChainSearchResult with reranked results and statistics - - Examples: - >>> engine = ChainSearchEngine(registry, mapper, config=config) - >>> result = engine.binary_cascade_search( - ... "how to authenticate users", - ... Path("D:/project/src"), - ... k=10, - ... coarse_k=100 - ... ) - >>> for r in result.results: - ... 
print(f"{r.path}: {r.score:.3f}") - """ - if not NUMPY_AVAILABLE: - self.logger.warning( - "NumPy not available, falling back to hybrid cascade search" - ) - return self.hybrid_cascade_search(query, source_path, k, coarse_k, options) - - options = options or SearchOptions() - start_time = time.time() - stats = SearchStats() - - # Use config defaults if available - if self._config is not None: - if hasattr(self._config, "cascade_coarse_k"): - coarse_k = coarse_k or self._config.cascade_coarse_k - if hasattr(self._config, "cascade_fine_k"): - k = k or self._config.cascade_fine_k - - # Step 1: Find starting index - start_index = self._find_start_index(source_path) - if not start_index: - self.logger.warning(f"No index found for {source_path}") - stats.time_ms = (time.time() - start_time) * 1000 - return ChainSearchResult( - query=query, - results=[], - symbols=[], - stats=stats - ) - - # Step 2: Collect all index paths - index_paths = self._collect_index_paths(start_index, options.depth) - stats.dirs_searched = len(index_paths) - - if not index_paths: - self.logger.warning(f"No indexes collected from {start_index}") - stats.time_ms = (time.time() - start_time) * 1000 - return ChainSearchResult( - query=query, - results=[], - symbols=[], - stats=stats - ) - - # Initialize embedding backends - try: - from codexlens.indexing.embedding import ( - BinaryEmbeddingBackend, - DenseEmbeddingBackend, - ) - from codexlens.semantic.ann_index import BinaryANNIndex - except ImportError as exc: - self.logger.warning( - "Binary cascade dependencies not available: %s. 
" - "Falling back to hybrid cascade search.", - exc - ) - return self.hybrid_cascade_search(query, source_path, k, coarse_k, options) - - # Stage 1: Binary vector coarse retrieval - self.logger.debug( - "Binary Cascade Stage 1: Binary coarse retrieval for %d candidates", - coarse_k, - ) - - use_gpu = True - if self._config is not None: - use_gpu = getattr(self._config, "embedding_use_gpu", True) - - try: - binary_backend = BinaryEmbeddingBackend(use_gpu=use_gpu) - query_binary_packed = binary_backend.embed_packed([query])[0] - except Exception as exc: - self.logger.warning( - "Failed to generate binary query embedding: %s. " - "Falling back to hybrid cascade search.", - exc - ) - return self.hybrid_cascade_search(query, source_path, k, coarse_k, options) - - # Try centralized BinarySearcher first (preferred for mmap indexes) - # The index root is the parent of the first index path - index_root = index_paths[0].parent if index_paths else None - all_candidates: List[Tuple[int, int, Path]] = [] # (chunk_id, distance, index_path) - used_centralized = False - - if index_root: - centralized_searcher = self._get_centralized_binary_searcher(index_root) - if centralized_searcher is not None: - try: - # BinarySearcher expects dense vector, not packed binary - from codexlens.semantic.embedder import Embedder - embedder = Embedder() - query_dense = embedder.embed_to_numpy([query])[0] - - # Centralized search - returns (chunk_id, hamming_distance) tuples - results = centralized_searcher.search(query_dense, top_k=coarse_k) - for chunk_id, dist in results: - all_candidates.append((chunk_id, dist, index_root)) - used_centralized = True - self.logger.debug( - "Centralized binary search found %d candidates", len(results) - ) - except Exception as exc: - self.logger.debug( - "Centralized binary search failed: %s, falling back to per-directory", - exc - ) - centralized_searcher.clear() - - # Fallback: Search per-directory indexes with legacy BinaryANNIndex - if not used_centralized: - 
for index_path in index_paths: - try: - # Get or create binary index for this path (uses deprecated BinaryANNIndex) - binary_index = self._get_or_create_binary_index(index_path) - if binary_index is None or binary_index.count() == 0: - continue - - # Search binary index - ids, distances = binary_index.search(query_binary_packed, coarse_k) - for chunk_id, dist in zip(ids, distances): - all_candidates.append((chunk_id, dist, index_path)) - - except Exception as exc: - self.logger.debug( - "Binary search failed for %s: %s", index_path, exc - ) - stats.errors.append(f"Binary search failed for {index_path}: {exc}") - - if not all_candidates: - self.logger.debug("No binary candidates found, falling back to hybrid") - return self.hybrid_cascade_search(query, source_path, k, coarse_k, options) - - # Sort by Hamming distance and take top coarse_k - all_candidates.sort(key=lambda x: x[1]) - coarse_candidates = all_candidates[:coarse_k] - - self.logger.debug( - "Binary Cascade Stage 1 complete: %d candidates retrieved", - len(coarse_candidates), - ) - - # Stage 2: Dense vector fine ranking - self.logger.debug( - "Binary Cascade Stage 2: Dense reranking %d candidates to top-%d", - len(coarse_candidates), - k, - ) - - try: - dense_backend = DenseEmbeddingBackend(use_gpu=use_gpu) - query_dense = dense_backend.embed_to_numpy([query])[0] - except Exception as exc: - self.logger.warning( - "Failed to generate dense query embedding: %s. 
" - "Using Hamming distance scores only.", - exc - ) - # Fall back to using Hamming distance as score - return self._build_results_from_candidates( - coarse_candidates[:k], index_paths, stats, query, start_time, - use_centralized=used_centralized - ) - - # Group candidates by index path for batch retrieval - candidates_by_index: Dict[Path, List[int]] = {} - for chunk_id, _, index_path in coarse_candidates: - if index_path not in candidates_by_index: - candidates_by_index[index_path] = [] - candidates_by_index[index_path].append(chunk_id) - - # Retrieve dense embeddings and compute cosine similarity - scored_results: List[Tuple[float, SearchResult]] = [] - import sqlite3 - - for index_path, chunk_ids in candidates_by_index.items(): - try: - # Collect valid rows and dense vectors for batch processing - valid_rows: List[Dict[str, Any]] = [] - dense_vectors: List["np.ndarray"] = [] - - if used_centralized: - # Centralized mode: index_path is actually index_root directory - # Dense embeddings are in per-directory _index.db files - # referenced by source_index_db in chunk_metadata - meta_db_path = index_path / VECTORS_META_DB_NAME - if not meta_db_path.exists(): - self.logger.debug( - "VectorMetadataStore not found at %s, skipping dense reranking", meta_db_path - ) - continue - - # Get chunk metadata with source_index_db references - meta_store = VectorMetadataStore(meta_db_path) - chunks_meta = meta_store.get_chunks_by_ids(chunk_ids) - - # Group chunks by source_index_db - chunks_by_source: Dict[str, List[Dict[str, Any]]] = {} - for chunk in chunks_meta: - source_db = chunk.get("source_index_db") - if source_db: - if source_db not in chunks_by_source: - chunks_by_source[source_db] = [] - chunks_by_source[source_db].append(chunk) - - # Retrieve dense embeddings from each source_index_db - for source_db, source_chunks in chunks_by_source.items(): - try: - source_chunk_ids = [c["chunk_id"] for c in source_chunks] - conn = sqlite3.connect(source_db) - conn.row_factory = 
sqlite3.Row - - placeholders = ",".join("?" * len(source_chunk_ids)) - # Try semantic_chunks first (newer schema), fall back to chunks - try: - rows = conn.execute( - f"SELECT id, embedding_dense FROM semantic_chunks WHERE id IN ({placeholders})", - source_chunk_ids - ).fetchall() - except sqlite3.OperationalError: - rows = conn.execute( - f"SELECT id, embedding_dense FROM chunks WHERE id IN ({placeholders})", - source_chunk_ids - ).fetchall() - conn.close() - - # Build dense vector lookup - dense_lookup = {row["id"]: row["embedding_dense"] for row in rows} - - # Process chunks with their embeddings - for chunk in source_chunks: - chunk_id = chunk["chunk_id"] - dense_bytes = dense_lookup.get(chunk_id) - if dense_bytes is not None: - valid_rows.append({ - "id": chunk_id, - "file_path": chunk["file_path"], - "content": chunk["content"], - }) - dense_vectors.append(np.frombuffer(dense_bytes, dtype=np.float32)) - except Exception as exc: - self.logger.debug( - "Failed to get dense embeddings from %s: %s", source_db, exc - ) - else: - # Per-directory mode: index_path is the _index.db file - conn = sqlite3.connect(str(index_path)) - conn.row_factory = sqlite3.Row - - placeholders = ",".join("?" 
* len(chunk_ids)) - rows = conn.execute( - f"SELECT id, file_path, content, embedding_dense FROM semantic_chunks WHERE id IN ({placeholders})", - chunk_ids - ).fetchall() - conn.close() - - for row in rows: - dense_bytes = row["embedding_dense"] - if dense_bytes is not None: - valid_rows.append(dict(row)) - dense_vectors.append(np.frombuffer(dense_bytes, dtype=np.float32)) - - # Skip if no dense embeddings found - if not dense_vectors: - continue - - # Stack into matrix for batch computation - doc_matrix = np.vstack(dense_vectors) - - # Batch compute cosine similarities - scores = self._compute_cosine_similarity_batch(query_dense, doc_matrix) - - # Create search results - for i, row in enumerate(valid_rows): - score = float(scores[i]) - excerpt = (row.get("content") or "")[:500] - result = SearchResult( - path=row.get("file_path") or "", - score=score, - excerpt=excerpt, - ) - scored_results.append((score, result)) - - except Exception as exc: - self.logger.debug( - "Dense reranking failed for %s: %s", index_path, exc - ) - stats.errors.append(f"Dense reranking failed for {index_path}: {exc}") - - # Sort by score descending and deduplicate by path - scored_results.sort(key=lambda x: x[0], reverse=True) - - path_to_result: Dict[str, SearchResult] = {} - for score, result in scored_results: - if result.path not in path_to_result: - path_to_result[result.path] = result - - final_results = list(path_to_result.values())[:k] - - # Optional: grouping of similar results - if options.group_results: - from codexlens.search.ranking import group_similar_results - final_results = group_similar_results( - final_results, score_threshold_abs=options.grouping_threshold - ) - - stats.files_matched = len(final_results) - stats.time_ms = (time.time() - start_time) * 1000 - - self.logger.debug( - "Binary cascade search complete: %d results in %.2fms", - len(final_results), - stats.time_ms, - ) - - return ChainSearchResult( - query=query, - results=final_results, - symbols=[], - 
stats=stats, - ) - - def cascade_search( - self, - query: str, - source_path: Path, - k: int = 10, - coarse_k: int = 100, - options: Optional[SearchOptions] = None, - strategy: Optional[Literal["binary", "hybrid", "binary_rerank", "dense_rerank", "staged"]] = None, - ) -> ChainSearchResult: - """Unified cascade search entry point with strategy selection. - - Provides a single interface for cascade search with configurable strategy: - - "binary": Uses binary vector coarse ranking + dense fine ranking (fastest) - - "hybrid": Uses FTS+SPLADE+Vector coarse ranking + cross-encoder reranking (original) - - "binary_rerank": Uses binary vector coarse ranking + cross-encoder reranking (best balance) - - "dense_rerank": Uses dense vector coarse ranking + cross-encoder reranking - - "staged": 4-stage pipeline: binary -> LSP expand -> clustering -> optional rerank - - The strategy is determined with the following priority: - 1. The `strategy` parameter (e.g., from CLI --cascade-strategy option) - 2. Config `cascade_strategy` setting from settings.json - 3. Default: "binary" - - Args: - query: Natural language or keyword query string - source_path: Starting directory path - k: Number of final results to return (default 10) - coarse_k: Number of coarse candidates from first stage (default 100) - options: Search configuration (uses defaults if None) - strategy: Cascade strategy - "binary", "hybrid", "binary_rerank", "dense_rerank", or "staged". 
- - Returns: - ChainSearchResult with reranked results and statistics - - Examples: - >>> engine = ChainSearchEngine(registry, mapper, config=config) - >>> # Use binary cascade (default, fastest) - >>> result = engine.cascade_search("auth", Path("D:/project")) - >>> # Use hybrid cascade (original behavior) - >>> result = engine.cascade_search("auth", Path("D:/project"), strategy="hybrid") - >>> # Use binary + cross-encoder (best balance of speed and quality) - >>> result = engine.cascade_search("auth", Path("D:/project"), strategy="binary_rerank") - >>> # Use 4-stage pipeline (binary + LSP expand + clustering + optional rerank) - >>> result = engine.cascade_search("auth", Path("D:/project"), strategy="staged") - """ - # Strategy priority: parameter > config > default - effective_strategy = strategy - valid_strategies = ("binary", "hybrid", "binary_rerank", "dense_rerank", "staged") - if effective_strategy is None: - # Not passed via parameter, check config - if self._config is not None: - config_strategy = getattr(self._config, "cascade_strategy", None) - if config_strategy in valid_strategies: - effective_strategy = config_strategy - - # If still not set, apply default - if effective_strategy not in valid_strategies: - effective_strategy = "binary" - - if effective_strategy == "binary": - return self.binary_cascade_search(query, source_path, k, coarse_k, options) - elif effective_strategy == "binary_rerank": - return self.binary_rerank_cascade_search(query, source_path, k, coarse_k, options) - elif effective_strategy == "dense_rerank": - return self.dense_rerank_cascade_search(query, source_path, k, coarse_k, options) - elif effective_strategy == "staged": - return self.staged_cascade_search(query, source_path, k, coarse_k, options) - else: - return self.hybrid_cascade_search(query, source_path, k, coarse_k, options) - - def staged_cascade_search( - self, - query: str, - source_path: Path, - k: int = 10, - coarse_k: int = 100, - options: Optional[SearchOptions] = 
None, - ) -> ChainSearchResult: - """Execute 4-stage cascade search pipeline with binary, LSP expansion, clustering, and optional reranking. - - Staged cascade search process: - 1. Stage 1 (Binary Coarse): Fast binary vector search using Hamming distance - to quickly filter to coarse_k candidates (256-bit binary vectors) - 2. Stage 2 (LSP Expansion): Expand coarse candidates using GraphExpander to - include related symbols (definitions, references, callers/callees) - 3. Stage 3 (Clustering): Use configurable clustering strategy to group similar - results and select representative results from each cluster - 4. Stage 4 (Optional Rerank): If config.enable_staged_rerank is True, apply - cross-encoder reranking for final precision - - This approach combines the speed of binary search with graph-based context - expansion and diversity-preserving clustering for high-quality results. - - Performance characteristics: - - Stage 1: O(N) binary search with SIMD acceleration (~8ms) - - Stage 2: O(k * d) graph traversal where d is expansion depth - - Stage 3: O(n^2) clustering on expanded candidates - - Stage 4: Optional cross-encoder reranking (API call) - - Args: - query: Natural language or keyword query string - source_path: Starting directory path - k: Number of final results to return (default 10) - coarse_k: Number of coarse candidates from first stage (default 100) - options: Search configuration (uses defaults if None) - - Returns: - ChainSearchResult with per-stage statistics - - Examples: - >>> engine = ChainSearchEngine(registry, mapper, config=config) - >>> result = engine.staged_cascade_search( - ... "authentication handler", - ... Path("D:/project/src"), - ... k=10, - ... coarse_k=100 - ... ) - >>> for r in result.results: - ... 
print(f"{r.path}: {r.score:.3f}") - """ - if not NUMPY_AVAILABLE: - self.logger.warning( - "NumPy not available, falling back to hybrid cascade search" - ) - return self.hybrid_cascade_search(query, source_path, k, coarse_k, options) - - options = options or SearchOptions() - start_time = time.time() - stats = SearchStats() - - # Per-stage timing stats - stage_times: Dict[str, float] = {} - stage_counts: Dict[str, int] = {} - - # Use config defaults if available - if self._config is not None: - if hasattr(self._config, "cascade_coarse_k"): - coarse_k = coarse_k or self._config.cascade_coarse_k - if hasattr(self._config, "cascade_fine_k"): - k = k or self._config.cascade_fine_k - - # Step 1: Find starting index - start_index = self._find_start_index(source_path) - if not start_index: - self.logger.warning(f"No index found for {source_path}") - stats.time_ms = (time.time() - start_time) * 1000 - return ChainSearchResult( - query=query, - results=[], - symbols=[], - stats=stats - ) - - # Step 2: Collect all index paths - index_paths = self._collect_index_paths(start_index, options.depth) - stats.dirs_searched = len(index_paths) - - if not index_paths: - self.logger.warning(f"No indexes collected from {start_index}") - stats.time_ms = (time.time() - start_time) * 1000 - return ChainSearchResult( - query=query, - results=[], - symbols=[], - stats=stats - ) - - # ========== Stage 1: Binary Coarse Search ========== - stage1_start = time.time() - coarse_results, index_root = self._stage1_binary_search( - query, index_paths, coarse_k, stats - ) - stage_times["stage1_binary_ms"] = (time.time() - stage1_start) * 1000 - stage_counts["stage1_candidates"] = len(coarse_results) - - self.logger.debug( - "Staged Stage 1: Binary search found %d candidates in %.2fms", - len(coarse_results), stage_times["stage1_binary_ms"] - ) - - if not coarse_results: - self.logger.debug("No binary candidates found, falling back to hybrid cascade") - return self.hybrid_cascade_search(query, 
source_path, k, coarse_k, options) - - # ========== Stage 2: LSP Graph Expansion ========== - stage2_start = time.time() - expanded_results = self._stage2_lsp_expand(coarse_results, index_root) - stage_times["stage2_expand_ms"] = (time.time() - stage2_start) * 1000 - stage_counts["stage2_expanded"] = len(expanded_results) - - self.logger.debug( - "Staged Stage 2: LSP expansion %d -> %d results in %.2fms", - len(coarse_results), len(expanded_results), stage_times["stage2_expand_ms"] - ) - - # ========== Stage 3: Clustering and Representative Selection ========== - stage3_start = time.time() - clustered_results = self._stage3_cluster_prune(expanded_results, k * 2) - stage_times["stage3_cluster_ms"] = (time.time() - stage3_start) * 1000 - stage_counts["stage3_clustered"] = len(clustered_results) - - self.logger.debug( - "Staged Stage 3: Clustering %d -> %d representatives in %.2fms", - len(expanded_results), len(clustered_results), stage_times["stage3_cluster_ms"] - ) - - # ========== Stage 4: Optional Cross-Encoder Reranking ========== - enable_rerank = False - if self._config is not None: - enable_rerank = getattr(self._config, "enable_staged_rerank", False) - - if enable_rerank: - stage4_start = time.time() - final_results = self._stage4_optional_rerank(query, clustered_results, k) - stage_times["stage4_rerank_ms"] = (time.time() - stage4_start) * 1000 - stage_counts["stage4_reranked"] = len(final_results) - - self.logger.debug( - "Staged Stage 4: Reranking %d -> %d results in %.2fms", - len(clustered_results), len(final_results), stage_times["stage4_rerank_ms"] - ) - else: - # Skip reranking, just take top-k by score - final_results = sorted( - clustered_results, key=lambda r: r.score, reverse=True - )[:k] - stage_counts["stage4_reranked"] = len(final_results) - - # Deduplicate by path (keep highest score) - path_to_result: Dict[str, SearchResult] = {} - for result in final_results: - if result.path not in path_to_result or result.score > 
path_to_result[result.path].score: - path_to_result[result.path] = result - - final_results = list(path_to_result.values())[:k] - - # Optional: grouping of similar results - if options.group_results: - from codexlens.search.ranking import group_similar_results - final_results = group_similar_results( - final_results, score_threshold_abs=options.grouping_threshold - ) - - stats.files_matched = len(final_results) - stats.time_ms = (time.time() - start_time) * 1000 - - # Add per-stage stats to errors field (as JSON for now, will be proper field later) - stage_stats_json = json.dumps({ - "stage_times": stage_times, - "stage_counts": stage_counts, - }) - stats.errors.append(f"STAGE_STATS:{stage_stats_json}") - - self.logger.debug( - "Staged cascade search complete: %d results in %.2fms " - "(stage1=%.1fms, stage2=%.1fms, stage3=%.1fms)", - len(final_results), - stats.time_ms, - stage_times.get("stage1_binary_ms", 0), - stage_times.get("stage2_expand_ms", 0), - stage_times.get("stage3_cluster_ms", 0), - ) - - return ChainSearchResult( - query=query, - results=final_results, - symbols=[], - stats=stats, - ) - - def _stage1_binary_search( - self, - query: str, - index_paths: List[Path], - coarse_k: int, - stats: SearchStats, - ) -> Tuple[List[SearchResult], Optional[Path]]: - """Stage 1: Binary vector coarse search using Hamming distance. - - Reuses the binary coarse search logic from binary_cascade_search. 
- - Args: - query: Search query string - index_paths: List of index database paths to search - coarse_k: Number of coarse candidates to retrieve - stats: SearchStats to update with errors - - Returns: - Tuple of (list of SearchResult objects, index_root path or None) - """ - # Initialize binary embedding backend - try: - from codexlens.indexing.embedding import BinaryEmbeddingBackend - except ImportError as exc: - self.logger.warning( - "BinaryEmbeddingBackend not available: %s", exc - ) - return [], None - - # Try centralized BinarySearcher first (preferred for mmap indexes) - index_root = index_paths[0].parent if index_paths else None - coarse_candidates: List[Tuple[int, int, Path]] = [] # (chunk_id, distance, index_path) - used_centralized = False - - if index_root: - binary_searcher = self._get_centralized_binary_searcher(index_root) - if binary_searcher is not None: - try: - from codexlens.semantic.embedder import Embedder - embedder = Embedder() - query_dense = embedder.embed_to_numpy([query])[0] - - results = binary_searcher.search(query_dense, top_k=coarse_k) - for chunk_id, distance in results: - coarse_candidates.append((chunk_id, distance, index_root)) - if coarse_candidates: - used_centralized = True - self.logger.debug( - "Stage 1 centralized binary search: %d candidates", len(results) - ) - except Exception as exc: - self.logger.debug(f"Centralized binary search failed: {exc}") - - if not used_centralized: - # Fallback to per-directory binary indexes - use_gpu = True - if self._config is not None: - use_gpu = getattr(self._config, "embedding_use_gpu", True) - - try: - binary_backend = BinaryEmbeddingBackend(use_gpu=use_gpu) - query_binary = binary_backend.embed_packed([query])[0] - except Exception as exc: - self.logger.warning(f"Failed to generate binary query embedding: {exc}") - return [], index_root - - for index_path in index_paths: - try: - binary_index = self._get_or_create_binary_index(index_path) - if binary_index is None or 
binary_index.count() == 0: - continue - ids, distances = binary_index.search(query_binary, coarse_k) - for chunk_id, dist in zip(ids, distances): - coarse_candidates.append((chunk_id, dist, index_path)) - except Exception as exc: - self.logger.debug( - "Binary search failed for %s: %s", index_path, exc - ) - - if not coarse_candidates: - return [], index_root - - # Sort by Hamming distance and take top coarse_k - coarse_candidates.sort(key=lambda x: x[1]) - coarse_candidates = coarse_candidates[:coarse_k] - - # Build SearchResult objects from candidates - coarse_results: List[SearchResult] = [] - - # Group candidates by index path for efficient retrieval - candidates_by_index: Dict[Path, List[int]] = {} - for chunk_id, _, idx_path in coarse_candidates: - if idx_path not in candidates_by_index: - candidates_by_index[idx_path] = [] - candidates_by_index[idx_path].append(chunk_id) - - # Retrieve chunk content - import sqlite3 - central_meta_path = index_root / VECTORS_META_DB_NAME if index_root else None - central_meta_store = None - if central_meta_path and central_meta_path.exists(): - central_meta_store = VectorMetadataStore(central_meta_path) - - for idx_path, chunk_ids in candidates_by_index.items(): - try: - chunks_data = [] - if central_meta_store: - chunks_data = central_meta_store.get_chunks_by_ids(chunk_ids) - - if not chunks_data and used_centralized: - meta_db_path = idx_path / VECTORS_META_DB_NAME - if meta_db_path.exists(): - meta_store = VectorMetadataStore(meta_db_path) - chunks_data = meta_store.get_chunks_by_ids(chunk_ids) - - if not chunks_data: - try: - conn = sqlite3.connect(str(idx_path)) - conn.row_factory = sqlite3.Row - placeholders = ",".join("?" 
* len(chunk_ids)) - cursor = conn.execute( - f""" - SELECT id, file_path, content, metadata, category - FROM semantic_chunks - WHERE id IN ({placeholders}) - """, - chunk_ids - ) - chunks_data = [ - { - "id": row["id"], - "file_path": row["file_path"], - "content": row["content"], - "metadata": row["metadata"], - "category": row["category"], - } - for row in cursor.fetchall() - ] - conn.close() - except Exception: - pass - - for chunk in chunks_data: - chunk_id = chunk.get("id") or chunk.get("chunk_id") - distance = next( - (d for cid, d, _ in coarse_candidates if cid == chunk_id), - 256 - ) - score = 1.0 - (distance / 256.0) - - content = chunk.get("content", "") - - # Extract symbol info from metadata if available - metadata = chunk.get("metadata") - symbol_name = None - symbol_kind = None - start_line = None - end_line = None - if metadata: - try: - meta_dict = json.loads(metadata) if isinstance(metadata, str) else metadata - symbol_name = meta_dict.get("symbol_name") - symbol_kind = meta_dict.get("symbol_kind") - start_line = meta_dict.get("start_line") - end_line = meta_dict.get("end_line") - except Exception: - pass - - result = SearchResult( - path=chunk.get("file_path", ""), - score=float(score), - excerpt=content[:500] if content else "", - content=content, - symbol_name=symbol_name, - symbol_kind=symbol_kind, - start_line=start_line, - end_line=end_line, - ) - coarse_results.append(result) - except Exception as exc: - self.logger.debug( - "Failed to retrieve chunks from %s: %s", idx_path, exc - ) - stats.errors.append(f"Stage 1 chunk retrieval failed for {idx_path}: {exc}") - - return coarse_results, index_root - - def _stage2_lsp_expand( - self, - coarse_results: List[SearchResult], - index_root: Optional[Path], - ) -> List[SearchResult]: - """Stage 2: LSP-based graph expansion using GraphExpander. - - Expands coarse results with related symbols (definitions, references, - callers, callees) using precomputed graph neighbors. 
- - Args: - coarse_results: Results from Stage 1 binary search - index_root: Root path of the index (for graph database access) - - Returns: - Combined list of original results plus expanded related results - """ - if not coarse_results or index_root is None: - return coarse_results - - try: - from codexlens.search.graph_expander import GraphExpander - - # Get expansion depth from config - depth = 2 - if self._config is not None: - depth = getattr(self._config, "graph_expansion_depth", 2) - - expander = GraphExpander(self.mapper, config=self._config) - - # Expand top results (limit expansion to avoid explosion) - max_expand = min(10, len(coarse_results)) - max_related = 50 - - related_results = expander.expand( - coarse_results, - depth=depth, - max_expand=max_expand, - max_related=max_related, - ) - - if related_results: - self.logger.debug( - "Stage 2 expanded %d base results to %d related symbols", - len(coarse_results), len(related_results) - ) - - # Combine: original results + related results - # Keep original results first (higher relevance) - combined = list(coarse_results) - seen_keys = {(r.path, r.symbol_name, r.start_line) for r in coarse_results} - - for related in related_results: - key = (related.path, related.symbol_name, related.start_line) - if key not in seen_keys: - seen_keys.add(key) - combined.append(related) - - return combined - - except ImportError as exc: - self.logger.debug("GraphExpander not available: %s", exc) - return coarse_results - except Exception as exc: - self.logger.debug("Stage 2 LSP expansion failed: %s", exc) - return coarse_results - - def _stage3_cluster_prune( - self, - expanded_results: List[SearchResult], - target_count: int, - ) -> List[SearchResult]: - """Stage 3: Cluster expanded results and select representatives. - - Uses the extensible clustering infrastructure from codexlens.search.clustering - to group similar results and select the best representative from each cluster. 
- - Args: - expanded_results: Results from Stage 2 expansion - target_count: Target number of representative results - - Returns: - List of representative results (one per cluster) - """ - if not expanded_results: - return [] - - # If few results, skip clustering - if len(expanded_results) <= target_count: - return expanded_results - - try: - from codexlens.search.clustering import ( - ClusteringConfig, - get_strategy, - ) - - # Get clustering config from config - strategy_name = "auto" - min_cluster_size = 3 - - if self._config is not None: - strategy_name = getattr(self._config, "staged_clustering_strategy", "auto") - min_cluster_size = getattr(self._config, "staged_clustering_min_size", 3) - - # Get embeddings for clustering - # Try to get dense embeddings from results' content - embeddings = self._get_embeddings_for_clustering(expanded_results) - - if embeddings is None or len(embeddings) == 0: - # No embeddings available, fall back to score-based selection - self.logger.debug("No embeddings for clustering, using score-based selection") - return sorted( - expanded_results, key=lambda r: r.score, reverse=True - )[:target_count] - - # Create clustering config - config = ClusteringConfig( - min_cluster_size=min(min_cluster_size, max(2, len(expanded_results) // 5)), - min_samples=2, - metric="cosine", - ) - - # Get strategy with fallback - strategy = get_strategy(strategy_name, config, fallback=True) - - # Cluster and select representatives - representatives = strategy.fit_predict(embeddings, expanded_results) - - self.logger.debug( - "Stage 3 clustered %d results into %d representatives using %s", - len(expanded_results), len(representatives), type(strategy).__name__ - ) - - # If clustering returned too few, supplement with top-scored unclustered - if len(representatives) < target_count: - rep_paths = {r.path for r in representatives} - remaining = [r for r in expanded_results if r.path not in rep_paths] - remaining_sorted = sorted(remaining, key=lambda r: 
r.score, reverse=True) - representatives.extend(remaining_sorted[:target_count - len(representatives)]) - - return representatives[:target_count] - - except ImportError as exc: - self.logger.debug("Clustering not available: %s", exc) - return sorted( - expanded_results, key=lambda r: r.score, reverse=True - )[:target_count] - except Exception as exc: - self.logger.debug("Stage 3 clustering failed: %s", exc) - return sorted( - expanded_results, key=lambda r: r.score, reverse=True - )[:target_count] - - def _stage4_optional_rerank( - self, - query: str, - clustered_results: List[SearchResult], - k: int, - ) -> List[SearchResult]: - """Stage 4: Optional cross-encoder reranking. - - Applies cross-encoder reranking if enabled in config. - - Args: - query: Search query string - clustered_results: Results from Stage 3 clustering - k: Number of final results to return - - Returns: - Reranked results sorted by cross-encoder score - """ - if not clustered_results: - return [] - - # Use existing _cross_encoder_rerank method - return self._cross_encoder_rerank(query, clustered_results, k) - - def _get_embeddings_for_clustering( - self, - results: List[SearchResult], - ) -> Optional["np.ndarray"]: - """Get dense embeddings for clustering results. - - Tries to generate embeddings from result content for clustering. 
- - Args: - results: List of SearchResult objects - - Returns: - NumPy array of embeddings or None if not available - """ - if not NUMPY_AVAILABLE: - return None - - if not results: - return None - - try: - from codexlens.semantic.factory import get_embedder - - # Get embedding settings from config - embedding_backend = "fastembed" - embedding_model = "code" - use_gpu = True - - if self._config is not None: - embedding_backend = getattr(self._config, "embedding_backend", "fastembed") - embedding_model = getattr(self._config, "embedding_model", "code") - use_gpu = getattr(self._config, "embedding_use_gpu", True) - - # Create embedder - if embedding_backend == "litellm": - embedder = get_embedder(backend="litellm", model=embedding_model) - else: - embedder = get_embedder(backend="fastembed", profile=embedding_model, use_gpu=use_gpu) - - # Extract text content from results - texts = [] - for result in results: - # Use content if available, otherwise use excerpt - text = result.content or result.excerpt or "" - if not text and result.path: - text = result.path - texts.append(text[:2000]) # Limit text length - - # Generate embeddings - embeddings = embedder.embed_to_numpy(texts) - return embeddings - - except ImportError as exc: - self.logger.debug("Embedder not available for clustering: %s", exc) - return None - except Exception as exc: - self.logger.debug("Failed to generate embeddings for clustering: %s", exc) - return None - - def binary_rerank_cascade_search( - self, - query: str, - source_path: Path, - k: int = 10, - coarse_k: int = 100, - options: Optional[SearchOptions] = None, - ) -> ChainSearchResult: - """Execute binary cascade search with cross-encoder reranking. - - Combines the speed of binary vector coarse search with the quality of - cross-encoder reranking for the best balance of speed and accuracy. - - Binary + Reranker cascade process: - 1. 
Stage 1 (Coarse): Fast binary vector search using Hamming distance - to quickly filter to coarse_k candidates (256-dim binary, 32 bytes/vector) - 2. Stage 2 (Fine): Cross-encoder reranking for precise semantic ranking - of candidates using query-document attention - - This approach is typically faster than hybrid_cascade_search while - achieving similar or better quality through cross-encoder reranking. - - Performance characteristics: - - Binary search: O(N) with SIMD-accelerated XOR + popcount (~8ms) - - Cross-encoder: Applied to top coarse_k candidates (~15-20s for API) - - Total: Faster coarse + high-quality fine = best balance - - Args: - query: Natural language or keyword query string - source_path: Starting directory path - k: Number of final results to return (default 10) - coarse_k: Number of coarse candidates from first stage (default 100) - options: Search configuration (uses defaults if None) - - Returns: - ChainSearchResult with cross-encoder reranked results and statistics - - Examples: - >>> engine = ChainSearchEngine(registry, mapper, config=config) - >>> result = engine.binary_rerank_cascade_search( - ... "how to authenticate users", - ... Path("D:/project/src"), - ... k=10, - ... coarse_k=100 - ... ) - >>> for r in result.results: - ... 
print(f"{r.path}: {r.score:.3f}") - """ - if not NUMPY_AVAILABLE: - self.logger.warning( - "NumPy not available, falling back to hybrid cascade search" - ) - return self.hybrid_cascade_search(query, source_path, k, coarse_k, options) - - options = options or SearchOptions() - start_time = time.time() - stats = SearchStats() - - # Use config defaults if available - if self._config is not None: - if hasattr(self._config, "cascade_coarse_k"): - coarse_k = coarse_k or self._config.cascade_coarse_k - if hasattr(self._config, "cascade_fine_k"): - k = k or self._config.cascade_fine_k - - # Step 1: Find starting index - start_index = self._find_start_index(source_path) - if not start_index: - self.logger.warning(f"No index found for {source_path}") - stats.time_ms = (time.time() - start_time) * 1000 - return ChainSearchResult( - query=query, - results=[], - symbols=[], - stats=stats - ) - - # Step 2: Collect all index paths - index_paths = self._collect_index_paths(start_index, options.depth) - stats.dirs_searched = len(index_paths) - - if not index_paths: - self.logger.warning(f"No indexes collected from {start_index}") - stats.time_ms = (time.time() - start_time) * 1000 - return ChainSearchResult( - query=query, - results=[], - symbols=[], - stats=stats - ) - - # Initialize binary embedding backend - try: - from codexlens.indexing.embedding import BinaryEmbeddingBackend - except ImportError as exc: - self.logger.warning( - "BinaryEmbeddingBackend not available: %s, falling back to hybrid cascade", - exc - ) - return self.hybrid_cascade_search(query, source_path, k, coarse_k, options) - - # Step 4: Binary coarse search (same as binary_cascade_search) - binary_coarse_time = time.time() - coarse_candidates: List[Tuple[int, int, Path]] = [] - - # Try centralized BinarySearcher first (preferred for mmap indexes) - # The index root is the parent of the first index path - index_root = index_paths[0].parent if index_paths else None - used_centralized = False - - if index_root: - 
binary_searcher = self._get_centralized_binary_searcher(index_root) - if binary_searcher is not None: - try: - # BinarySearcher expects dense vector, not packed binary - from codexlens.semantic.embedder import Embedder - embedder = Embedder() - query_dense = embedder.embed_to_numpy([query])[0] - - results = binary_searcher.search(query_dense, top_k=coarse_k) - for chunk_id, distance in results: - coarse_candidates.append((chunk_id, distance, index_root)) - # Only mark as used if we got actual results - if coarse_candidates: - used_centralized = True - self.logger.debug( - "Binary coarse search (centralized): %d candidates in %.2fms", - len(results), (time.time() - binary_coarse_time) * 1000 - ) - except Exception as exc: - self.logger.debug(f"Centralized binary search failed: {exc}") - - if not used_centralized: - # Get GPU preference from config - use_gpu = True - if self._config is not None: - use_gpu = getattr(self._config, "embedding_use_gpu", True) - - try: - binary_backend = BinaryEmbeddingBackend(use_gpu=use_gpu) - query_binary = binary_backend.embed_packed([query])[0] - except Exception as exc: - self.logger.warning(f"Failed to generate binary query embedding: {exc}") - return self.hybrid_cascade_search(query, source_path, k, coarse_k, options) - - # Fallback to per-directory binary indexes - for index_path in index_paths: - try: - binary_index = self._get_or_create_binary_index(index_path) - if binary_index is None or binary_index.count() == 0: - continue - # BinaryANNIndex returns (ids, distances) arrays - ids, distances = binary_index.search(query_binary, coarse_k) - for chunk_id, dist in zip(ids, distances): - coarse_candidates.append((chunk_id, dist, index_path)) - except Exception as exc: - self.logger.debug( - "Binary search failed for %s: %s", index_path, exc - ) - - if not coarse_candidates: - self.logger.info("No binary candidates found, falling back to hybrid cascade for reranking") - # Fall back to hybrid_cascade_search which uses FTS+Vector 
coarse + cross-encoder rerank - return self.hybrid_cascade_search(query, source_path, k, coarse_k, options) - - # Sort by Hamming distance and take top coarse_k - coarse_candidates.sort(key=lambda x: x[1]) - coarse_candidates = coarse_candidates[:coarse_k] - - self.logger.debug( - "Binary coarse search: %d candidates in %.2fms", - len(coarse_candidates), (time.time() - binary_coarse_time) * 1000 - ) - - # Step 5: Build SearchResult objects for cross-encoder reranking - # Group candidates by index path for efficient retrieval - candidates_by_index: Dict[Path, List[int]] = {} - for chunk_id, distance, index_path in coarse_candidates: - if index_path not in candidates_by_index: - candidates_by_index[index_path] = [] - candidates_by_index[index_path].append(chunk_id) - - # Retrieve chunk content for reranking - # Always use centralized VectorMetadataStore since chunks are stored there - import sqlite3 - coarse_results: List[SearchResult] = [] - - # Find the centralized metadata store path (project root) - # index_root was computed earlier, use it for chunk retrieval - central_meta_path = index_root / VECTORS_META_DB_NAME if index_root else None - central_meta_store = None - if central_meta_path and central_meta_path.exists(): - central_meta_store = VectorMetadataStore(central_meta_path) - - for index_path, chunk_ids in candidates_by_index.items(): - try: - chunks_data = [] - if central_meta_store: - # Try centralized VectorMetadataStore first (preferred) - chunks_data = central_meta_store.get_chunks_by_ids(chunk_ids) - - if not chunks_data and used_centralized: - # Fallback to per-index-path meta store - meta_db_path = index_path / VECTORS_META_DB_NAME - if meta_db_path.exists(): - meta_store = VectorMetadataStore(meta_db_path) - chunks_data = meta_store.get_chunks_by_ids(chunk_ids) - - if not chunks_data: - # Final fallback: query semantic_chunks table directly - # This handles per-directory indexes with semantic_chunks table - try: - conn = 
sqlite3.connect(str(index_path)) - conn.row_factory = sqlite3.Row - placeholders = ",".join("?" * len(chunk_ids)) - cursor = conn.execute( - f""" - SELECT id, file_path, content, metadata, category - FROM semantic_chunks - WHERE id IN ({placeholders}) - """, - chunk_ids - ) - chunks_data = [ - { - "id": row["id"], - "file_path": row["file_path"], - "content": row["content"], - "metadata": row["metadata"], - "category": row["category"], - } - for row in cursor.fetchall() - ] - conn.close() - except Exception: - pass # Skip if table doesn't exist - - for chunk in chunks_data: - # Find the Hamming distance for this chunk - chunk_id = chunk.get("id") or chunk.get("chunk_id") - distance = next( - (d for cid, d, _ in coarse_candidates if cid == chunk_id), - 256 - ) - # Initial score from Hamming distance (will be replaced by reranker) - score = 1.0 - (distance / 256.0) - - content = chunk.get("content", "") - result = SearchResult( - path=chunk.get("file_path", ""), - score=float(score), - excerpt=content[:500] if content else "", - content=content, - ) - coarse_results.append(result) - except Exception as exc: - self.logger.debug( - "Failed to retrieve chunks from %s: %s", index_path, exc - ) - - if not coarse_results: - stats.time_ms = (time.time() - start_time) * 1000 - return ChainSearchResult( - query=query, results=[], symbols=[], stats=stats - ) - - self.logger.debug( - "Retrieved %d chunks for cross-encoder reranking", len(coarse_results) - ) - - # Step 6: Cross-encoder reranking (same as hybrid_cascade_search) - rerank_time = time.time() - reranked_results = self._cross_encoder_rerank(query, coarse_results, top_k=k) - - self.logger.debug( - "Cross-encoder reranking: %d results in %.2fms", - len(reranked_results), (time.time() - rerank_time) * 1000 - ) - - # Deduplicate by path (keep highest score) - path_to_result: Dict[str, SearchResult] = {} - for result in reranked_results: - if result.path not in path_to_result or result.score > 
path_to_result[result.path].score: - path_to_result[result.path] = result - - final_results = list(path_to_result.values())[:k] - - stats.files_matched = len(final_results) - stats.time_ms = (time.time() - start_time) * 1000 - - self.logger.debug( - "Binary+Rerank cascade search complete: %d results in %.2fms", - len(final_results), - stats.time_ms, - ) - - return ChainSearchResult( - query=query, - results=final_results, - symbols=[], - stats=stats, - ) - - def dense_rerank_cascade_search( - self, - query: str, - source_path: Path, - k: int = 10, - coarse_k: int = 100, - options: Optional[SearchOptions] = None, - ) -> ChainSearchResult: - """Execute dense cascade search with cross-encoder reranking. - - Combines dense vector coarse search (HNSW) with cross-encoder reranking - for comparison with binary_rerank strategy. - - Dense + Reranker cascade process: - 1. Stage 1 (Coarse): Dense vector search using HNSW (cosine similarity) - to get coarse_k candidates (2048-dim float32) - 2. Stage 2 (Fine): Cross-encoder reranking for precise semantic ranking - - Args: - query: Natural language or keyword query string - source_path: Starting directory path - k: Number of final results to return (default 10) - coarse_k: Number of coarse candidates from first stage (default 100) - options: Search configuration (uses defaults if None) - - Returns: - ChainSearchResult with cross-encoder reranked results and statistics - """ - if not NUMPY_AVAILABLE: - self.logger.warning( - "NumPy not available, falling back to hybrid cascade search" - ) - return self.hybrid_cascade_search(query, source_path, k, coarse_k, options) - - options = options or SearchOptions() - start_time = time.time() - stats = SearchStats() - - # Use config defaults if available - if self._config is not None: - if hasattr(self._config, "cascade_coarse_k"): - coarse_k = coarse_k or self._config.cascade_coarse_k - if hasattr(self._config, "cascade_fine_k"): - k = k or self._config.cascade_fine_k - - # Step 1: Find 
starting index - start_index = self._find_start_index(source_path) - if not start_index: - self.logger.warning(f"No index found for {source_path}") - stats.time_ms = (time.time() - start_time) * 1000 - return ChainSearchResult( - query=query, - results=[], - symbols=[], - stats=stats - ) - - # Step 2: Collect all index paths - index_paths = self._collect_index_paths(start_index, options.depth) - stats.dirs_searched = len(index_paths) - - if not index_paths: - self.logger.warning(f"No indexes collected from {start_index}") - stats.time_ms = (time.time() - start_time) * 1000 - return ChainSearchResult( - query=query, - results=[], - symbols=[], - stats=stats - ) - - # Step 3: Find centralized HNSW index and read model config - from codexlens.config import VECTORS_HNSW_NAME - central_hnsw_path = None - index_root = start_index.parent - current_dir = index_root - for _ in range(10): # Limit search depth - candidate = current_dir / VECTORS_HNSW_NAME - if candidate.exists(): - central_hnsw_path = candidate - index_root = current_dir # Update to where HNSW was found - break - parent = current_dir.parent - if parent == current_dir: # Reached root - break - current_dir = parent - - # Step 4: Generate query dense embedding using same model as centralized index - # Read embedding config to match the model used during indexing - dense_coarse_time = time.time() - try: - from codexlens.semantic.factory import get_embedder - - # Get embedding settings from centralized index config (preferred) or fallback to self._config - embedding_backend = "litellm" # Default to API for dense - embedding_model = "qwen3-embedding-sf" # Default model - use_gpu = True - - # Try to read model config from centralized index's embeddings_config table - central_index_db = index_root / "_index.db" - if central_index_db.exists(): - try: - from codexlens.semantic.vector_store import VectorStore - with VectorStore(central_index_db) as vs: - model_config = vs.get_model_config() - if model_config: - 
embedding_backend = model_config.get("backend", embedding_backend) - embedding_model = model_config.get("model_name", embedding_model) - self.logger.debug( - "Read model config from centralized index: %s/%s", - embedding_backend, embedding_model - ) - except Exception as e: - self.logger.debug("Failed to read centralized model config: %s", e) - - # Fallback to self._config if not read from index - if self._config is not None: - if embedding_backend == "litellm" and embedding_model == "qwen3-embedding-sf": - # Only use config values if we didn't read from centralized index - config_backend = getattr(self._config, "embedding_backend", None) - config_model = getattr(self._config, "embedding_model", None) - if config_backend: - embedding_backend = config_backend - if config_model: - embedding_model = config_model - use_gpu = getattr(self._config, "embedding_use_gpu", True) - - # Create embedder matching index configuration - if embedding_backend == "litellm": - embedder = get_embedder(backend="litellm", model=embedding_model) - else: - embedder = get_embedder(backend="fastembed", profile=embedding_model, use_gpu=use_gpu) - - query_dense = embedder.embed_to_numpy([query])[0] - self.logger.debug(f"Dense query embedding: {query_dense.shape[0]}-dim via {embedding_backend}/{embedding_model}") - except Exception as exc: - self.logger.warning(f"Failed to generate dense query embedding: {exc}") - return self.hybrid_cascade_search(query, source_path, k, coarse_k, options) - - # Step 5: Dense coarse search using centralized HNSW index - coarse_candidates: List[Tuple[int, float, Path]] = [] # (chunk_id, distance, index_path) - - if central_hnsw_path is not None: - # Use centralized index - try: - from codexlens.semantic.ann_index import ANNIndex - ann_index = ANNIndex.create_central( - index_root=index_root, - dim=query_dense.shape[0], - ) - if ann_index.load() and ann_index.count() > 0: - # Search centralized HNSW index - ids, distances = ann_index.search(query_dense, 
top_k=coarse_k) - for chunk_id, dist in zip(ids, distances): - coarse_candidates.append((chunk_id, dist, index_root / "_index.db")) - self.logger.debug( - "Centralized dense search: %d candidates from %s", - len(ids), central_hnsw_path - ) - except Exception as exc: - self.logger.debug( - "Centralized dense search failed for %s: %s", central_hnsw_path, exc - ) - - # Fallback: try per-directory HNSW indexes if centralized not found - if not coarse_candidates: - for index_path in index_paths: - try: - # Load HNSW index - from codexlens.semantic.ann_index import ANNIndex - ann_index = ANNIndex(index_path, dim=query_dense.shape[0]) - if not ann_index.load(): - continue - - if ann_index.count() == 0: - continue - - # Search HNSW index - ids, distances = ann_index.search(query_dense, top_k=coarse_k) - for chunk_id, dist in zip(ids, distances): - coarse_candidates.append((chunk_id, dist, index_path)) - - except Exception as exc: - self.logger.debug( - "Dense search failed for %s: %s", index_path, exc - ) - - if not coarse_candidates: - self.logger.info("No dense candidates found, falling back to hybrid cascade") - return self.hybrid_cascade_search(query, source_path, k, coarse_k, options) - - # Sort by distance (ascending for cosine distance) and take top coarse_k - coarse_candidates.sort(key=lambda x: x[1]) - coarse_candidates = coarse_candidates[:coarse_k] - - self.logger.debug( - "Dense coarse search: %d candidates in %.2fms", - len(coarse_candidates), (time.time() - dense_coarse_time) * 1000 - ) - - # Step 6: Build SearchResult objects for cross-encoder reranking - candidates_by_index: Dict[Path, List[int]] = {} - for chunk_id, distance, index_path in coarse_candidates: - if index_path not in candidates_by_index: - candidates_by_index[index_path] = [] - candidates_by_index[index_path].append(chunk_id) - - # Retrieve chunk content for reranking - import sqlite3 - coarse_results: List[SearchResult] = [] - - for index_path, chunk_ids in candidates_by_index.items(): - 
try: - # For centralized index, use _vectors_meta.db for chunk metadata - # which contains file_path, content, start_line, end_line - if central_hnsw_path is not None and index_path == index_root / "_index.db": - # Use centralized metadata from _vectors_meta.db - meta_db_path = index_root / "_vectors_meta.db" - if meta_db_path.exists(): - conn = sqlite3.connect(str(meta_db_path)) - conn.row_factory = sqlite3.Row - placeholders = ",".join("?" * len(chunk_ids)) - cursor = conn.execute( - f""" - SELECT chunk_id, file_path, content, start_line, end_line - FROM chunk_metadata - WHERE chunk_id IN ({placeholders}) - """, - chunk_ids - ) - chunks_data = [ - { - "id": row["chunk_id"], - "file_path": row["file_path"], - "content": row["content"], - "metadata": json.dumps({ - "start_line": row["start_line"], - "end_line": row["end_line"] - }), - "category": "code" if row["file_path"].endswith(('.py', '.ts', '.js', '.java', '.go', '.rs', '.cpp', '.c')) else "doc", - } - for row in cursor.fetchall() - ] - conn.close() - else: - chunks_data = [] - else: - # Fall back to per-directory semantic_chunks table - conn = sqlite3.connect(str(index_path)) - conn.row_factory = sqlite3.Row - placeholders = ",".join("?" 
* len(chunk_ids)) - cursor = conn.execute( - f""" - SELECT id, file_path, content, metadata, category - FROM semantic_chunks - WHERE id IN ({placeholders}) - """, - chunk_ids - ) - chunks_data = [ - { - "id": row["id"], - "file_path": row["file_path"], - "content": row["content"], - "metadata": row["metadata"], - "category": row["category"], - } - for row in cursor.fetchall() - ] - conn.close() - - for chunk in chunks_data: - chunk_id = chunk.get("id") - distance = next( - (d for cid, d, _ in coarse_candidates if cid == chunk_id), - 1.0 - ) - # Convert cosine distance to score (clamp to [0, 1] for Pydantic validation) - # Cosine distance can be > 1 for anti-correlated vectors, causing negative scores - score = max(0.0, 1.0 - distance) - - content = chunk.get("content", "") - result = SearchResult( - path=chunk.get("file_path", ""), - score=float(score), - excerpt=content[:500] if content else "", - content=content, - ) - coarse_results.append(result) - except Exception as exc: - self.logger.debug( - "Failed to retrieve chunks from %s: %s", index_path, exc - ) - - if not coarse_results: - stats.time_ms = (time.time() - start_time) * 1000 - return ChainSearchResult( - query=query, results=[], symbols=[], stats=stats - ) - - self.logger.debug( - "Retrieved %d chunks for cross-encoder reranking", len(coarse_results) - ) - - # Step 6: Cross-encoder reranking - rerank_time = time.time() - reranked_results = self._cross_encoder_rerank(query, coarse_results, top_k=k) - - self.logger.debug( - "Cross-encoder reranking: %d results in %.2fms", - len(reranked_results), (time.time() - rerank_time) * 1000 - ) - - # Deduplicate by path (keep highest score) - path_to_result: Dict[str, SearchResult] = {} - for result in reranked_results: - if result.path not in path_to_result or result.score > path_to_result[result.path].score: - path_to_result[result.path] = result - - final_results = list(path_to_result.values())[:k] - - stats.files_matched = len(final_results) - stats.time_ms = 
(time.time() - start_time) * 1000 - - self.logger.debug( - "Dense+Rerank cascade search complete: %d results in %.2fms", - len(final_results), - stats.time_ms, - ) - - return ChainSearchResult( - query=query, - results=final_results, - symbols=[], - stats=stats, - ) - - def _get_or_create_binary_index(self, index_path: Path) -> Optional[Any]: - """Get or create a BinaryANNIndex for the given index path. - - .. deprecated:: - This method uses the deprecated BinaryANNIndex. For centralized indexes, - use _get_centralized_binary_searcher() instead. - - Attempts to load an existing binary index from disk. If not found, - returns None (binary index should be built during indexing). - - Args: - index_path: Path to the _index.db file - - Returns: - BinaryANNIndex instance or None if not available - """ - try: - import warnings - # Suppress deprecation warning since we're using it intentionally for legacy support - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=DeprecationWarning) - from codexlens.semantic.ann_index import BinaryANNIndex - - binary_index = BinaryANNIndex(index_path, dim=256) - if binary_index.load(): - return binary_index - return None - except Exception as exc: - self.logger.debug("Failed to load binary index for %s: %s", index_path, exc) - return None - - def _get_centralized_binary_searcher(self, index_root: Path) -> Optional[Any]: - """Get centralized BinarySearcher for memory-mapped binary vectors. - - This is the preferred method for centralized indexes, providing faster - search via memory-mapped files. 
- - Args: - index_root: Root directory containing centralized index files - - Returns: - BinarySearcher instance or None if not available - """ - try: - from codexlens.search.binary_searcher import BinarySearcher - - binary_searcher = BinarySearcher(index_root) - if binary_searcher.load(): - self.logger.debug( - "Using centralized BinarySearcher with %d vectors (mmap=%s)", - binary_searcher.vector_count, - binary_searcher.is_memmap - ) - return binary_searcher - return None - except Exception as exc: - self.logger.debug("Failed to load centralized binary searcher: %s", exc) - return None - - def _compute_cosine_similarity( - self, - query_vec: "np.ndarray", - doc_vec: "np.ndarray", - ) -> float: - """Compute cosine similarity between query and document vectors. - - Args: - query_vec: Query embedding vector - doc_vec: Document embedding vector - - Returns: - Cosine similarity score in range [-1, 1] - """ - if not NUMPY_AVAILABLE: - return 0.0 - - # Ensure same shape - min_len = min(len(query_vec), len(doc_vec)) - q = query_vec[:min_len] - d = doc_vec[:min_len] - - # Compute cosine similarity - dot_product = np.dot(q, d) - norm_q = np.linalg.norm(q) - norm_d = np.linalg.norm(d) - - if norm_q == 0 or norm_d == 0: - return 0.0 - - return float(dot_product / (norm_q * norm_d)) - - def _compute_cosine_similarity_batch( - self, - query_vec: "np.ndarray", - doc_matrix: "np.ndarray", - ) -> "np.ndarray": - """Compute cosine similarity between query and multiple document vectors. - - Uses vectorized matrix operations for efficient batch computation. 
- - Args: - query_vec: Query embedding vector of shape (dim,) - doc_matrix: Document embeddings matrix of shape (n_docs, dim) - - Returns: - Array of cosine similarity scores of shape (n_docs,) - """ - if not NUMPY_AVAILABLE: - return np.zeros(doc_matrix.shape[0]) - - # Ensure query is 1D - if query_vec.ndim > 1: - query_vec = query_vec.flatten() - - # Handle dimension mismatch by truncating to smaller dimension - min_dim = min(len(query_vec), doc_matrix.shape[1]) - q = query_vec[:min_dim] - docs = doc_matrix[:, :min_dim] - - # Compute query norm once - norm_q = np.linalg.norm(q) - if norm_q == 0: - return np.zeros(docs.shape[0]) - - # Normalize query - q_normalized = q / norm_q - - # Compute document norms (vectorized) - doc_norms = np.linalg.norm(docs, axis=1) - - # Avoid division by zero - nonzero_mask = doc_norms > 0 - scores = np.zeros(docs.shape[0], dtype=np.float32) - - if np.any(nonzero_mask): - # Normalize documents with non-zero norms - docs_normalized = docs[nonzero_mask] / doc_norms[nonzero_mask, np.newaxis] - - # Batch dot product: (n_docs, dim) @ (dim,) = (n_docs,) - scores[nonzero_mask] = docs_normalized @ q_normalized - - return scores - - def _build_results_from_candidates( - self, - candidates: List[Tuple[int, int, Path]], - index_paths: List[Path], - stats: SearchStats, - query: str, - start_time: float, - use_centralized: bool = False, - ) -> ChainSearchResult: - """Build ChainSearchResult from binary candidates using Hamming distance scores. - - Used as fallback when dense embeddings are not available. 
- - Args: - candidates: List of (chunk_id, hamming_distance, index_path) tuples - index_paths: List of all searched index paths - stats: SearchStats to update - query: Original query string - start_time: Search start time for timing - use_centralized: If True, index_path is the index_root directory - and VectorMetadataStore should be used instead of SQLiteStore - - Returns: - ChainSearchResult with results scored by Hamming distance - """ - results: List[SearchResult] = [] - - # Group by index path - candidates_by_index: Dict[Path, List[Tuple[int, int]]] = {} - for chunk_id, distance, index_path in candidates: - if index_path not in candidates_by_index: - candidates_by_index[index_path] = [] - candidates_by_index[index_path].append((chunk_id, distance)) - - for index_path, chunk_tuples in candidates_by_index.items(): - try: - chunk_ids = [c[0] for c in chunk_tuples] - - # Use VectorMetadataStore for centralized search, SQLiteStore for per-directory - if use_centralized: - # index_path is actually index_root directory for centralized search - meta_db_path = index_path / VECTORS_META_DB_NAME - if not meta_db_path.exists(): - self.logger.debug( - "VectorMetadataStore not found at %s, skipping", meta_db_path - ) - continue - meta_store = VectorMetadataStore(meta_db_path) - chunks_data = meta_store.get_chunks_by_ids(chunk_ids) - else: - store = SQLiteStore(index_path) - chunks_data = store.get_chunks_by_ids(chunk_ids) - - chunk_content: Dict[int, Dict[str, Any]] = { - c["id"]: c for c in chunks_data - } - - for chunk_id, distance in chunk_tuples: - chunk_info = chunk_content.get(chunk_id) - if chunk_info is None: - continue - - # Convert Hamming distance to score (lower distance = higher score) - # Max Hamming distance for 256-bit is 256 - score = 1.0 - (distance / 256.0) - - excerpt = chunk_info.get("content", "")[:500] - result = SearchResult( - path=chunk_info.get("file_path", ""), - score=float(score), - excerpt=excerpt, - ) - results.append(result) - - except 
Exception as exc: - self.logger.debug( - "Failed to build results from %s: %s", index_path, exc - ) - - # Deduplicate by path - path_to_result: Dict[str, SearchResult] = {} - for result in results: - if result.path not in path_to_result or result.score > path_to_result[result.path].score: - path_to_result[result.path] = result - - final_results = sorted( - path_to_result.values(), - key=lambda r: r.score, - reverse=True, - ) - - stats.files_matched = len(final_results) - stats.time_ms = (time.time() - start_time) * 1000 - - return ChainSearchResult( - query=query, - results=final_results, - symbols=[], - stats=stats, - ) - - def _cross_encoder_rerank( - self, - query: str, - results: List[SearchResult], - top_k: int, - ) -> List[SearchResult]: - """Rerank results using cross-encoder model. - - Args: - query: Search query string - results: Candidate results to rerank - top_k: Number of top results to return - - Returns: - Reranked results sorted by cross-encoder score - """ - if not results: - return [] - - # Try to get reranker from config or create new one - reranker = None - try: - from codexlens.semantic.reranker import ( - check_reranker_available, - get_reranker, - ) - - # Determine backend and model from config - backend = "onnx" - model_name = None - use_gpu = True - - if self._config is not None: - backend = getattr(self._config, "reranker_backend", "onnx") or "onnx" - model_name = getattr(self._config, "reranker_model", None) - use_gpu = getattr(self._config, "embedding_use_gpu", True) - - ok, err = check_reranker_available(backend) - if not ok: - self.logger.debug("Reranker backend unavailable (%s): %s", backend, err) - return results[:top_k] - - # Create reranker - kwargs = {} - if backend == "onnx": - kwargs["use_gpu"] = use_gpu - elif backend == "api": - # Pass max_input_tokens for adaptive batching - max_tokens = getattr(self._config, "reranker_max_input_tokens", None) - if max_tokens: - kwargs["max_input_tokens"] = max_tokens - - reranker = 
get_reranker(backend=backend, model_name=model_name, **kwargs) - - except ImportError as exc: - self.logger.debug("Reranker not available: %s", exc) - return results[:top_k] - except Exception as exc: - self.logger.debug("Failed to initialize reranker: %s", exc) - return results[:top_k] - - # Use cross_encoder_rerank from ranking module - from codexlens.search.ranking import cross_encoder_rerank - - # Get chunk_type weights and test_file_penalty from config - chunk_type_weights = None - test_file_penalty = 0.0 - - if self._config is not None: - chunk_type_weights = getattr(self._config, "reranker_chunk_type_weights", None) - test_file_penalty = getattr(self._config, "reranker_test_file_penalty", 0.0) - - return cross_encoder_rerank( - query=query, - results=results, - reranker=reranker, - top_k=top_k, - batch_size=32, - chunk_type_weights=chunk_type_weights, - test_file_penalty=test_file_penalty, - ) - - def search_files_only(self, query: str, - source_path: Path, - options: Optional[SearchOptions] = None) -> List[str]: - """Search and return only matching file paths. - - Faster than full search when excerpts are not needed. - - Args: - query: FTS5 search query string - source_path: Starting directory path - options: Search configuration (uses defaults if None) - - Returns: - List of file paths as strings - - Examples: - >>> engine = ChainSearchEngine(registry, mapper) - >>> paths = engine.search_files_only("TODO", Path("D:/project")) - >>> print(f"Found {len(paths)} files with TODOs") - """ - options = options or SearchOptions() - options.files_only = True - - result = self.search(query, source_path, options) - return [r.path for r in result.results] - - def search_symbols(self, name: str, - source_path: Path, - kind: Optional[str] = None, - options: Optional[SearchOptions] = None) -> List[Symbol]: - """Chain symbol search across directory hierarchy. 
- - Args: - name: Symbol name pattern (partial match supported) - source_path: Starting directory path - kind: Optional symbol kind filter (e.g., 'function', 'class') - options: Search configuration (uses defaults if None) - - Returns: - List of Symbol objects sorted by name - - Examples: - >>> engine = ChainSearchEngine(registry, mapper) - >>> funcs = engine.search_symbols("init", Path("D:/project"), kind="function") - >>> for sym in funcs[:10]: - ... print(f"{sym.name} ({sym.kind}): lines {sym.range}") - """ - options = options or SearchOptions() - - start_index = self._find_start_index(source_path) - if not start_index: - self.logger.warning(f"No index found for {source_path}") - return [] - - # Fast path: project-wide global symbol index (avoids chain traversal). - if self._config is None or getattr(self._config, "global_symbol_index_enabled", True): - try: - # Avoid relying on index_to_source() here; use the same logic as _find_start_index - # to determine the effective search root directory. - search_root = source_path.resolve() - exact_index = self.mapper.source_to_index_db(search_root) - if not exact_index.exists(): - nearest = self.registry.find_nearest_index(search_root) - if nearest: - search_root = nearest.source_path - - project = self.registry.find_by_source_path(str(search_root)) - if project: - global_db_path = Path(project["index_root"]) / GlobalSymbolIndex.DEFAULT_DB_NAME - if global_db_path.exists(): - query_limit = max(int(options.total_limit) * 10, int(options.total_limit)) - with GlobalSymbolIndex(global_db_path, project_id=int(project["id"])) as global_index: - candidates = global_index.search(name=name, kind=kind, limit=query_limit) - - # Apply depth constraint relative to the start index directory. - filtered: List[Symbol] = [] - for sym in candidates: - if not sym.file: - continue - try: - root_str = str(search_root) - file_dir_str = str(Path(sym.file).parent) - - # Normalize Windows long-path prefix (\\?\) if present. 
- if root_str.startswith("\\\\?\\"): - root_str = root_str[4:] - if file_dir_str.startswith("\\\\?\\"): - file_dir_str = file_dir_str[4:] - - root_cmp = root_str.lower().rstrip("\\/") - dir_cmp = file_dir_str.lower().rstrip("\\/") - - # Guard against Windows cross-drive comparisons (ValueError). - if os.name == "nt": - root_drive, _ = os.path.splitdrive(root_cmp) - dir_drive, _ = os.path.splitdrive(dir_cmp) - if root_drive and dir_drive and root_drive != dir_drive: - self.logger.debug( - "Skipping symbol due to cross-drive path (root=%s file=%s name=%s)", - root_cmp, - sym.file, - sym.name, - ) - continue - - if os.path.commonpath([root_cmp, dir_cmp]) != root_cmp: - continue - - rel = os.path.relpath(dir_cmp, root_cmp) - rel_depth = 0 if rel == "." else len(rel.split(os.sep)) - except ValueError as exc: - self.logger.debug( - "Skipping symbol due to path operation failure (root=%s file=%s name=%s): %s", - str(search_root), - sym.file, - sym.name, - exc, - ) - continue - except Exception as exc: - self.logger.debug( - "Skipping symbol due to unexpected path error (root=%s file=%s name=%s): %s", - str(search_root), - sym.file, - sym.name, - exc, - ) - continue - - if options.depth >= 0 and rel_depth > options.depth: - continue - filtered.append(sym) - - if filtered: - # Match existing semantics: dedupe by (name, kind, range), sort by name. 
- seen = set() - unique_symbols: List[Symbol] = [] - for sym in filtered: - key = (sym.name, sym.kind, sym.range) - if key in seen: - continue - seen.add(key) - unique_symbols.append(sym) - unique_symbols.sort(key=lambda s: s.name) - return unique_symbols[: options.total_limit] - except Exception as exc: - self.logger.debug("Global symbol index fast path failed: %s", exc) - - index_paths = self._collect_index_paths(start_index, options.depth) - if not index_paths: - return [] - - return self._search_symbols_parallel( - index_paths, name, kind, options.total_limit - ) - - def search_references( - self, - symbol_name: str, - source_path: Optional[Path] = None, - depth: int = -1, - limit: int = 100, - ) -> List[ReferenceResult]: - """Find all references to a symbol across the project. - - Searches the code_relationships table in all index databases to find - where the given symbol is referenced (called, imported, inherited, etc.). - - Args: - symbol_name: Fully qualified or simple name of the symbol to find references to - source_path: Starting path for search (default: workspace root from registry) - depth: Search depth (-1 = unlimited, 0 = current dir only) - limit: Maximum results to return (default 100) - - Returns: - List of ReferenceResult objects sorted by file path and line number - - Examples: - >>> engine = ChainSearchEngine(registry, mapper) - >>> refs = engine.search_references("authenticate", Path("D:/project/src")) - >>> for ref in refs[:10]: - ... 
print(f"{ref.file_path}:{ref.line} ({ref.relationship_type})") - """ - import sqlite3 - from concurrent.futures import as_completed - - # Determine starting path - if source_path is None: - # Try to get workspace root from registry - mappings = self.registry.list_mappings() - if mappings: - source_path = Path(mappings[0].source_path) - else: - self.logger.warning("No source path provided and no mappings in registry") - return [] - - # Find starting index - start_index = self._find_start_index(source_path) - if not start_index: - self.logger.warning(f"No index found for {source_path}") - return [] - - # Collect all index paths - index_paths = self._collect_index_paths(start_index, depth) - if not index_paths: - self.logger.debug(f"No indexes collected from {start_index}") - return [] - - self.logger.debug( - "Searching %d indexes for references to '%s'", - len(index_paths), symbol_name - ) - - # Search in parallel - all_results: List[ReferenceResult] = [] - executor = self._get_executor() - - def search_single_index(index_path: Path) -> List[ReferenceResult]: - """Search a single index for references.""" - results: List[ReferenceResult] = [] - try: - conn = sqlite3.connect(str(index_path), check_same_thread=False) - conn.row_factory = sqlite3.Row - - # Query code_relationships for references to this symbol - # Match either target_qualified_name containing the symbol name - # or an exact match on the last component - # Try full_path first (new schema), fallback to path (old schema) - try: - rows = conn.execute( - """ - SELECT DISTINCT - f.full_path as source_file, - cr.source_line, - cr.relationship_type, - f.content - FROM code_relationships cr - JOIN symbols s ON s.id = cr.source_symbol_id - JOIN files f ON f.id = s.file_id - WHERE cr.target_qualified_name LIKE ? - OR cr.target_qualified_name LIKE ? - OR cr.target_qualified_name = ? - ORDER BY f.full_path, cr.source_line - LIMIT ? 
- """, - ( - f"%{symbol_name}", # Ends with symbol name - f"%.{symbol_name}", # Qualified name ending with .symbol_name - symbol_name, # Exact match - limit, - ) - ).fetchall() - except sqlite3.OperationalError: - # Fallback for old schema with 'path' column - rows = conn.execute( - """ - SELECT DISTINCT - f.path as source_file, - cr.source_line, - cr.relationship_type, - f.content - FROM code_relationships cr - JOIN symbols s ON s.id = cr.source_symbol_id - JOIN files f ON f.id = s.file_id - WHERE cr.target_qualified_name LIKE ? - OR cr.target_qualified_name LIKE ? - OR cr.target_qualified_name = ? - ORDER BY f.path, cr.source_line - LIMIT ? - """, - ( - f"%{symbol_name}", # Ends with symbol name - f"%.{symbol_name}", # Qualified name ending with .symbol_name - symbol_name, # Exact match - limit, - ) - ).fetchall() - - for row in rows: - file_path = row["source_file"] - line = row["source_line"] or 1 - rel_type = row["relationship_type"] - content = row["content"] or "" - - # Extract context (3 lines around reference) - context = self._extract_context(content, line, context_lines=3) - - results.append(ReferenceResult( - file_path=file_path, - line=line, - column=0, # Column info not stored in code_relationships - context=context, - relationship_type=rel_type, - )) - - conn.close() - except sqlite3.DatabaseError as exc: - self.logger.debug( - "Failed to search references in %s: %s", index_path, exc - ) - except Exception as exc: - self.logger.debug( - "Unexpected error searching references in %s: %s", index_path, exc - ) - - return results - - # Submit parallel searches - futures = { - executor.submit(search_single_index, idx_path): idx_path - for idx_path in index_paths - } - - for future in as_completed(futures): - try: - results = future.result() - all_results.extend(results) - except Exception as exc: - idx_path = futures[future] - self.logger.debug( - "Reference search failed for %s: %s", idx_path, exc - ) - - # Deduplicate by (file_path, line) - seen: set = 
set() - unique_results: List[ReferenceResult] = [] - for ref in all_results: - key = (ref.file_path, ref.line) - if key not in seen: - seen.add(key) - unique_results.append(ref) - - # Sort by file path and line - unique_results.sort(key=lambda r: (r.file_path, r.line)) - - # Apply limit - return unique_results[:limit] - - def _extract_context( - self, - content: str, - line: int, - context_lines: int = 3 - ) -> str: - """Extract lines around a given line number from file content. - - Args: - content: Full file content - line: Target line number (1-based) - context_lines: Number of lines to include before and after - - Returns: - Context snippet as a string - """ - if not content: - return "" - - lines = content.splitlines() - total_lines = len(lines) - - if line < 1 or line > total_lines: - return "" - - # Calculate range (0-indexed internally) - start = max(0, line - 1 - context_lines) - end = min(total_lines, line + context_lines) - - context = lines[start:end] - return "\n".join(context) - - # === Internal Methods === - - def _find_start_index(self, source_path: Path) -> Optional[Path]: - """Find index database path for source directory. - - Attempts exact match first, then searches for nearest ancestor index. - - Args: - source_path: Source directory path - - Returns: - Path to _index.db file, or None if not found - """ - source_path = source_path.resolve() - - # Try exact match first - exact_index = self.mapper.source_to_index_db(source_path) - if exact_index.exists(): - self.logger.debug(f"Found exact index: {exact_index}") - return exact_index - - # Try nearest ancestor via registry - nearest = self.registry.find_nearest_index(source_path) - if nearest: - self.logger.debug(f"Found nearest index: {nearest.index_path}") - return nearest.index_path - - self.logger.warning(f"No index found for {source_path}") - return None - - def _collect_index_paths(self, start_index: Path, - depth: int) -> List[Path]: - """Recursively collect all subdirectory index paths. 
- - Traverses directory tree via subdirs table in each _index.db, - respecting depth limit. - - Args: - start_index: Starting _index.db path - depth: Maximum depth (-1 = unlimited, 0 = current only) - - Returns: - List of _index.db paths to search - """ - collected = [] - visited = set() - - def _collect_recursive(index_path: Path, current_depth: int): - # Normalize path to avoid duplicates - normalized = index_path.resolve() - if normalized in visited: - return - visited.add(normalized) - - # Add current index - if normalized.exists(): - collected.append(normalized) - else: - self.logger.debug(f"Index does not exist: {normalized}") - return - - # Check depth limit - if depth >= 0 and current_depth >= depth: - return - - # Read subdirs and recurse - try: - with DirIndexStore(normalized) as store: - subdirs = store.get_subdirs() - for subdir in subdirs: - _collect_recursive(subdir.index_path, current_depth + 1) - except Exception as exc: - self.logger.warning(f"Failed to read subdirs from {normalized}: {exc}") - - _collect_recursive(start_index, 0) - self.logger.info(f"Collected {len(collected)} indexes (depth={depth})") - return collected - - def _search_parallel(self, index_paths: List[Path], - query: str, - options: SearchOptions) -> tuple[List[SearchResult], SearchStats]: - """Search multiple indexes in parallel using shared ThreadPoolExecutor. 
- - Args: - index_paths: List of _index.db paths to search - query: FTS5 query string - options: Search configuration - - Returns: - Tuple of (all results, search statistics) - """ - all_results = [] - stats = SearchStats() - - # Force single-threaded execution for vector/hybrid search to avoid GPU crashes - # DirectML/ONNX have threading issues when multiple threads access GPU resources - effective_workers = options.max_workers - if options.enable_vector or options.hybrid_mode: - effective_workers = 1 - self.logger.debug("Using single-threaded mode for vector search (GPU safety)") - # Pre-load embedder to avoid initialization overhead per-search - try: - from codexlens.semantic.embedder import get_embedder - get_embedder(profile="code", use_gpu=True) - except Exception: - pass # Ignore pre-load failures - - executor = self._get_executor(effective_workers) - # Submit all search tasks - future_to_path = { - executor.submit( - self._search_single_index, - idx_path, - query, - options.limit_per_dir, - options.files_only, - options.include_semantic, - options.hybrid_mode, - options.enable_fuzzy, - options.enable_vector, - options.pure_vector, - options.enable_splade, - options.hybrid_weights - ): idx_path - for idx_path in index_paths - } - - # Collect results as they complete - for future in as_completed(future_to_path): - idx_path = future_to_path[future] - try: - results = future.result() - all_results.extend(results) - self.logger.debug(f"Got {len(results)} results from {idx_path.parent.name}") - except Exception as exc: - error_msg = f"Search failed for {idx_path}: {exc}" - self.logger.error(error_msg) - stats.errors.append(error_msg) - - return all_results, stats - - def _search_single_index(self, index_path: Path, - query: str, - limit: int, - files_only: bool = False, - include_semantic: bool = False, - hybrid_mode: bool = False, - enable_fuzzy: bool = True, - enable_vector: bool = False, - pure_vector: bool = False, - enable_splade: bool = False, - 
hybrid_weights: Optional[Dict[str, float]] = None) -> List[SearchResult]: - """Search a single index database. - - Handles exceptions gracefully, returning empty list on failure. - - Args: - index_path: Path to _index.db file - query: FTS5 query string (for FTS) or natural language query (for vector) - limit: Maximum results from this index - files_only: If True, skip snippet generation for faster search - include_semantic: If True, also search semantic keywords and merge results - hybrid_mode: If True, use hybrid search with RRF fusion - enable_fuzzy: Enable fuzzy FTS in hybrid mode - enable_vector: Enable vector semantic search - pure_vector: If True, only use vector search without FTS fallback - enable_splade: If True, force SPLADE sparse neural search - hybrid_weights: Custom RRF weights for hybrid search - - Returns: - List of SearchResult objects (empty on error) - """ - try: - # Use hybrid search if enabled - if hybrid_mode: - hybrid_engine = HybridSearchEngine(weights=hybrid_weights) - fts_results = hybrid_engine.search( - index_path, - query, - limit=limit, - enable_fuzzy=enable_fuzzy, - enable_vector=enable_vector, - pure_vector=pure_vector, - enable_splade=enable_splade, - ) - else: - # Single-FTS search (exact or fuzzy mode) - with DirIndexStore(index_path) as store: - # Get FTS results - if files_only: - # Fast path: return paths only without snippets - paths = store.search_files_only(query, limit=limit) - fts_results = [SearchResult(path=p, score=0.0, excerpt="") for p in paths] - else: - # Use fuzzy FTS if enable_fuzzy=True (mode="fuzzy"), otherwise exact FTS - if enable_fuzzy: - fts_results = store.search_fts_fuzzy( - query, limit=limit, return_full_content=True - ) - else: - fts_results = store.search_fts_exact( - query, limit=limit, return_full_content=True - ) - - # Optionally add semantic keyword results - if include_semantic: - try: - semantic_matches = store.search_semantic_keywords(query) - # Convert semantic matches to SearchResult with 0.8x 
weight - for file_entry, keywords in semantic_matches: - # Create excerpt from keywords - excerpt = f"Keywords: {', '.join(keywords[:5])}" - # Use a base score of 10.0 for semantic matches, weighted by 0.8 - semantic_result = SearchResult( - path=str(file_entry.full_path), - score=10.0 * 0.8, - excerpt=excerpt - ) - fts_results.append(semantic_result) - except Exception as sem_exc: - self.logger.debug(f"Semantic search error in {index_path}: {sem_exc}") - - return fts_results - except Exception as exc: - self.logger.debug(f"Search error in {index_path}: {exc}") - return [] - - def _filter_by_extension(self, results: List[SearchResult], - code_only: bool = False, - exclude_extensions: Optional[List[str]] = None) -> List[SearchResult]: - """Filter search results by file extension. - - Args: - results: Search results to filter - code_only: If True, exclude non-code files (md, txt, json, yaml, xml, etc.) - exclude_extensions: List of extensions to exclude (e.g., ["md", "txt"]) - - Returns: - Filtered results - """ - # Non-code file extensions (same as MCP tool smart-search.ts) - NON_CODE_EXTENSIONS = { - 'md', 'txt', 'json', 'yaml', 'yml', 'xml', 'csv', 'log', - 'ini', 'cfg', 'conf', 'toml', 'env', 'properties', - 'html', 'htm', 'svg', 'png', 'jpg', 'jpeg', 'gif', 'ico', 'webp', - 'pdf', 'doc', 'docx', 'xls', 'xlsx', 'ppt', 'pptx', - 'lock', 'sum', 'mod', - } - - # Build exclusion set - excluded_exts = set() - if exclude_extensions: - # Normalize extensions (remove leading dots, lowercase) - excluded_exts = {ext.lower().lstrip('.') for ext in exclude_extensions} - if code_only: - excluded_exts.update(NON_CODE_EXTENSIONS) - - if not excluded_exts: - return results - - # Filter results - filtered = [] - for result in results: - path_str = result.path - if not path_str: - continue - - # Extract extension from path - if '.' 
in path_str: - ext = path_str.rsplit('.', 1)[-1].lower() - if ext in excluded_exts: - continue # Skip this result - - filtered.append(result) - - return filtered - - def _merge_and_rank(self, results: List[SearchResult], - limit: int, offset: int = 0) -> List[SearchResult]: - """Aggregate, deduplicate, and rank results. - - Process: - 1. Deduplicate by path (keep highest score) - 2. Sort by score descending - 3. Apply offset and limit for pagination - - Args: - results: Raw results from all indexes - limit: Maximum results to return - offset: Number of results to skip (pagination offset) - - Returns: - Deduplicated and ranked results with pagination - """ - # Deduplicate by path, keeping best score - path_to_result: Dict[str, SearchResult] = {} - for result in results: - path = result.path - if path not in path_to_result or result.score > path_to_result[path].score: - path_to_result[path] = result - - # Sort by score descending - unique_results = list(path_to_result.values()) - unique_results.sort(key=lambda r: r.score, reverse=True) - - # Apply offset and limit for pagination - return unique_results[offset:offset + limit] - - def _search_symbols_parallel(self, index_paths: List[Path], - name: str, - kind: Optional[str], - limit: int) -> List[Symbol]: - """Search symbols across multiple indexes in parallel. 
- - Args: - index_paths: List of _index.db paths to search - name: Symbol name pattern - kind: Optional symbol kind filter - limit: Total symbol limit - - Returns: - Deduplicated and sorted symbols - """ - all_symbols = [] - - executor = self._get_executor() - # Submit all symbol search tasks - future_to_path = { - executor.submit( - self._search_symbols_single, - idx_path, - name, - kind - ): idx_path - for idx_path in index_paths - } - - # Collect results - for future in as_completed(future_to_path): - try: - symbols = future.result() - all_symbols.extend(symbols) - except Exception as exc: - self.logger.error(f"Symbol search failed: {exc}") - - # Deduplicate by (name, kind, range) - seen = set() - unique_symbols = [] - for sym in all_symbols: - key = (sym.name, sym.kind, sym.range) - if key not in seen: - seen.add(key) - unique_symbols.append(sym) - - # Sort by name - unique_symbols.sort(key=lambda s: s.name) - - return unique_symbols[:limit] - - def _search_symbols_single(self, index_path: Path, - name: str, - kind: Optional[str]) -> List[Symbol]: - """Search symbols in a single index. - - Args: - index_path: Path to _index.db file - name: Symbol name pattern - kind: Optional symbol kind filter - - Returns: - List of Symbol objects (empty on error) - """ - try: - with DirIndexStore(index_path) as store: - return store.search_symbols(name, kind=kind) - except Exception as exc: - self.logger.debug(f"Symbol search error in {index_path}: {exc}") - return [] - - -# === Convenience Functions === - -def quick_search(query: str, - source_path: Path, - depth: int = -1) -> List[SearchResult]: - """Quick search convenience function with automatic initialization. - - Creates temporary registry and mapper instances for one-off searches. - For repeated searches, create a ChainSearchEngine instance directly. 
- - Args: - query: FTS5 search query string - source_path: Starting directory path - depth: Maximum search depth (-1 = unlimited) - - Returns: - List of SearchResult objects sorted by relevance - - Examples: - >>> from pathlib import Path - >>> results = quick_search("authentication", Path("D:/project/src")) - >>> print(f"Found {len(results)} matches") - """ - registry = RegistryStore() - registry.initialize() - - mapper = PathMapper() - - with ChainSearchEngine(registry, mapper) as engine: - options = SearchOptions(depth=depth) - result = engine.search(query, source_path, options) - - registry.close() - - return result.results diff --git a/codex-lens/build/lib/codexlens/search/clustering/__init__.py b/codex-lens/build/lib/codexlens/search/clustering/__init__.py deleted file mode 100644 index d8161c98..00000000 --- a/codex-lens/build/lib/codexlens/search/clustering/__init__.py +++ /dev/null @@ -1,124 +0,0 @@ -"""Clustering strategies for the staged hybrid search pipeline. - -This module provides extensible clustering infrastructure for grouping -similar search results and selecting representative results. - -Install with: pip install codexlens[clustering] - -Example: - >>> from codexlens.search.clustering import ( - ... CLUSTERING_AVAILABLE, - ... ClusteringConfig, - ... get_strategy, - ... ) - >>> config = ClusteringConfig(min_cluster_size=3) - >>> # Auto-select best available strategy with fallback - >>> strategy = get_strategy("auto", config) - >>> representatives = strategy.fit_predict(embeddings, results) - >>> - >>> # Or explicitly use a specific strategy - >>> if CLUSTERING_AVAILABLE: - ... from codexlens.search.clustering import HDBSCANStrategy - ... strategy = HDBSCANStrategy(config) - ... 
representatives = strategy.fit_predict(embeddings, results) -""" - -from __future__ import annotations - -# Always export base classes and factory (no heavy dependencies) -from .base import BaseClusteringStrategy, ClusteringConfig -from .factory import ( - ClusteringStrategyFactory, - check_clustering_strategy_available, - get_strategy, -) -from .noop_strategy import NoOpStrategy -from .frequency_strategy import FrequencyStrategy, FrequencyConfig - -# Feature flag for clustering availability (hdbscan + sklearn) -CLUSTERING_AVAILABLE = False -HDBSCAN_AVAILABLE = False -DBSCAN_AVAILABLE = False -_import_error: str | None = None - - -def _detect_clustering_available() -> tuple[bool, bool, bool, str | None]: - """Detect if clustering dependencies are available. - - Returns: - Tuple of (all_available, hdbscan_available, dbscan_available, error_message). - """ - hdbscan_ok = False - dbscan_ok = False - - try: - import hdbscan # noqa: F401 - hdbscan_ok = True - except ImportError: - pass - - try: - from sklearn.cluster import DBSCAN # noqa: F401 - dbscan_ok = True - except ImportError: - pass - - all_ok = hdbscan_ok and dbscan_ok - error = None - if not all_ok: - missing = [] - if not hdbscan_ok: - missing.append("hdbscan") - if not dbscan_ok: - missing.append("scikit-learn") - error = f"{', '.join(missing)} not available. Install with: pip install codexlens[clustering]" - - return all_ok, hdbscan_ok, dbscan_ok, error - - -# Initialize on module load -CLUSTERING_AVAILABLE, HDBSCAN_AVAILABLE, DBSCAN_AVAILABLE, _import_error = ( - _detect_clustering_available() -) - - -def check_clustering_available() -> tuple[bool, str | None]: - """Check if all clustering dependencies are available. - - Returns: - Tuple of (is_available, error_message). - error_message is None if available, otherwise contains install instructions. 
- """ - return CLUSTERING_AVAILABLE, _import_error - - -# Conditionally export strategy implementations -__all__ = [ - # Feature flags - "CLUSTERING_AVAILABLE", - "HDBSCAN_AVAILABLE", - "DBSCAN_AVAILABLE", - "check_clustering_available", - # Base classes - "BaseClusteringStrategy", - "ClusteringConfig", - # Factory - "ClusteringStrategyFactory", - "get_strategy", - "check_clustering_strategy_available", - # Always-available strategies - "NoOpStrategy", - "FrequencyStrategy", - "FrequencyConfig", -] - -# Conditionally add strategy classes to __all__ and module namespace -if HDBSCAN_AVAILABLE: - from .hdbscan_strategy import HDBSCANStrategy - - __all__.append("HDBSCANStrategy") - -if DBSCAN_AVAILABLE: - from .dbscan_strategy import DBSCANStrategy - - __all__.append("DBSCANStrategy") diff --git a/codex-lens/build/lib/codexlens/search/clustering/base.py b/codex-lens/build/lib/codexlens/search/clustering/base.py deleted file mode 100644 index 912a4fc6..00000000 --- a/codex-lens/build/lib/codexlens/search/clustering/base.py +++ /dev/null @@ -1,153 +0,0 @@ -"""Base classes for clustering strategies in the hybrid search pipeline. - -This module defines the abstract base class for clustering strategies used -in the staged hybrid search pipeline. Strategies cluster search results -based on their embeddings and select representative results from each cluster. -""" - -from __future__ import annotations - -from abc import ABC, abstractmethod -from dataclasses import dataclass, field -from typing import TYPE_CHECKING, List, Optional - -if TYPE_CHECKING: - import numpy as np - from codexlens.entities import SearchResult - - -@dataclass -class ClusteringConfig: - """Configuration parameters for clustering strategies. - - Attributes: - min_cluster_size: Minimum number of results to form a cluster. - HDBSCAN default is 5, but for search results 2-3 is often better. - min_samples: Number of samples in a neighborhood for a point to be - considered a core point. 
Lower values allow more clusters. - metric: Distance metric for clustering. Common options: - - 'euclidean': Standard L2 distance - - 'cosine': Cosine distance (1 - cosine_similarity) - - 'manhattan': L1 distance - cluster_selection_epsilon: Distance threshold for cluster selection. - Results within this distance may be merged into the same cluster. - allow_single_cluster: If True, allow all results to form one cluster. - Useful when results are very similar. - prediction_data: If True, generate prediction data for new points. - """ - - min_cluster_size: int = 3 - min_samples: int = 2 - metric: str = "cosine" - cluster_selection_epsilon: float = 0.0 - allow_single_cluster: bool = True - prediction_data: bool = False - - def __post_init__(self) -> None: - """Validate configuration parameters.""" - if self.min_cluster_size < 2: - raise ValueError("min_cluster_size must be >= 2") - if self.min_samples < 1: - raise ValueError("min_samples must be >= 1") - if self.metric not in ("euclidean", "cosine", "manhattan"): - raise ValueError(f"metric must be one of: euclidean, cosine, manhattan; got {self.metric}") - if self.cluster_selection_epsilon < 0: - raise ValueError("cluster_selection_epsilon must be >= 0") - - -class BaseClusteringStrategy(ABC): - """Abstract base class for clustering strategies. - - Clustering strategies are used in the staged hybrid search pipeline to - group similar search results and select representative results from each - cluster, reducing redundancy while maintaining diversity. - - Subclasses must implement: - - cluster(): Group results into clusters based on embeddings - - select_representatives(): Choose best result(s) from each cluster - """ - - def __init__(self, config: Optional[ClusteringConfig] = None) -> None: - """Initialize the clustering strategy. - - Args: - config: Clustering configuration. Uses defaults if not provided. 
- """ - self.config = config or ClusteringConfig() - - @abstractmethod - def cluster( - self, - embeddings: "np.ndarray", - results: List["SearchResult"], - ) -> List[List[int]]: - """Cluster search results based on their embeddings. - - Args: - embeddings: NumPy array of shape (n_results, embedding_dim) - containing the embedding vectors for each result. - results: List of SearchResult objects corresponding to embeddings. - Used for additional metadata during clustering. - - Returns: - List of clusters, where each cluster is a list of indices - into the results list. Results not assigned to any cluster - (noise points) should be returned as single-element clusters. - - Example: - >>> strategy = HDBSCANStrategy() - >>> clusters = strategy.cluster(embeddings, results) - >>> # clusters = [[0, 2, 5], [1, 3], [4], [6, 7, 8]] - >>> # Result indices 0, 2, 5 are in cluster 0 - >>> # Result indices 1, 3 are in cluster 1 - >>> # Result index 4 is a noise point (singleton cluster) - >>> # Result indices 6, 7, 8 are in cluster 2 - """ - ... - - @abstractmethod - def select_representatives( - self, - clusters: List[List[int]], - results: List["SearchResult"], - embeddings: Optional["np.ndarray"] = None, - ) -> List["SearchResult"]: - """Select representative results from each cluster. - - This method chooses the best result(s) from each cluster to include - in the final search results. The selection can be based on: - - Highest score within cluster - - Closest to cluster centroid - - Custom selection logic - - Args: - clusters: List of clusters from cluster() method. - results: Original list of SearchResult objects. - embeddings: Optional embeddings array for centroid-based selection. - - Returns: - List of representative SearchResult objects, one or more per cluster, - ordered by relevance (highest score first). - - Example: - >>> representatives = strategy.select_representatives(clusters, results) - >>> # Returns best result from each cluster - """ - ... 
- - def fit_predict( - self, - embeddings: "np.ndarray", - results: List["SearchResult"], - ) -> List["SearchResult"]: - """Convenience method to cluster and select representatives in one call. - - Args: - embeddings: NumPy array of shape (n_results, embedding_dim). - results: List of SearchResult objects. - - Returns: - List of representative SearchResult objects. - """ - clusters = self.cluster(embeddings, results) - return self.select_representatives(clusters, results, embeddings) diff --git a/codex-lens/build/lib/codexlens/search/clustering/dbscan_strategy.py b/codex-lens/build/lib/codexlens/search/clustering/dbscan_strategy.py deleted file mode 100644 index 90588a91..00000000 --- a/codex-lens/build/lib/codexlens/search/clustering/dbscan_strategy.py +++ /dev/null @@ -1,197 +0,0 @@ -"""DBSCAN-based clustering strategy for search results. - -DBSCAN (Density-Based Spatial Clustering of Applications with Noise) -is the fallback clustering strategy when HDBSCAN is not available. -""" - -from __future__ import annotations - -from typing import TYPE_CHECKING, List, Optional - -from .base import BaseClusteringStrategy, ClusteringConfig - -if TYPE_CHECKING: - import numpy as np - from codexlens.entities import SearchResult - - -class DBSCANStrategy(BaseClusteringStrategy): - """DBSCAN-based clustering strategy. - - Uses sklearn's DBSCAN algorithm as a fallback when HDBSCAN is not available. - DBSCAN requires an explicit eps parameter, which is auto-computed from the - distance distribution if not provided. 
- - Example: - >>> from codexlens.search.clustering import DBSCANStrategy, ClusteringConfig - >>> config = ClusteringConfig(min_cluster_size=3, metric='cosine') - >>> strategy = DBSCANStrategy(config) - >>> clusters = strategy.cluster(embeddings, results) - >>> representatives = strategy.select_representatives(clusters, results) - """ - - # Default eps percentile for auto-computation - DEFAULT_EPS_PERCENTILE: float = 15.0 - - def __init__( - self, - config: Optional[ClusteringConfig] = None, - eps: Optional[float] = None, - eps_percentile: float = DEFAULT_EPS_PERCENTILE, - ) -> None: - """Initialize DBSCAN clustering strategy. - - Args: - config: Clustering configuration. Uses defaults if not provided. - eps: Explicit eps parameter for DBSCAN. If None, auto-computed - from the distance distribution. - eps_percentile: Percentile of pairwise distances to use for - auto-computing eps. Default is 15th percentile. - - Raises: - ImportError: If sklearn is not installed. - """ - super().__init__(config) - self.eps = eps - self.eps_percentile = eps_percentile - - # Validate sklearn is available - try: - from sklearn.cluster import DBSCAN # noqa: F401 - except ImportError as exc: - raise ImportError( - "scikit-learn package is required for DBSCANStrategy. " - "Install with: pip install codexlens[clustering]" - ) from exc - - def _compute_eps(self, embeddings: "np.ndarray") -> float: - """Auto-compute eps from pairwise distance distribution. - - Uses the specified percentile of pairwise distances as eps, - which typically captures local density well. - - Args: - embeddings: NumPy array of shape (n_results, embedding_dim). - - Returns: - Computed eps value. 
- """ - import numpy as np - from sklearn.metrics import pairwise_distances - - # Compute pairwise distances - distances = pairwise_distances(embeddings, metric=self.config.metric) - - # Get upper triangle (excluding diagonal) - upper_tri = distances[np.triu_indices_from(distances, k=1)] - - if len(upper_tri) == 0: - # Only one point, return a default small eps - return 0.1 - - # Use percentile of distances as eps - eps = float(np.percentile(upper_tri, self.eps_percentile)) - - # Ensure eps is positive - return max(eps, 1e-6) - - def cluster( - self, - embeddings: "np.ndarray", - results: List["SearchResult"], - ) -> List[List[int]]: - """Cluster search results using DBSCAN algorithm. - - Args: - embeddings: NumPy array of shape (n_results, embedding_dim) - containing the embedding vectors for each result. - results: List of SearchResult objects corresponding to embeddings. - - Returns: - List of clusters, where each cluster is a list of indices - into the results list. Noise points are returned as singleton clusters. 
- """ - from sklearn.cluster import DBSCAN - import numpy as np - - n_results = len(results) - if n_results == 0: - return [] - - # Handle edge case: single result - if n_results == 1: - return [[0]] - - # Determine eps value - eps = self.eps if self.eps is not None else self._compute_eps(embeddings) - - # Configure DBSCAN clusterer - # Note: DBSCAN min_samples corresponds to min_cluster_size concept - clusterer = DBSCAN( - eps=eps, - min_samples=self.config.min_samples, - metric=self.config.metric, - ) - - # Fit and get cluster labels - # Labels: -1 = noise, 0+ = cluster index - labels = clusterer.fit_predict(embeddings) - - # Group indices by cluster label - cluster_map: dict[int, list[int]] = {} - for idx, label in enumerate(labels): - if label not in cluster_map: - cluster_map[label] = [] - cluster_map[label].append(idx) - - # Build result: non-noise clusters first, then noise as singletons - clusters: List[List[int]] = [] - - # Add proper clusters (label >= 0) - for label in sorted(cluster_map.keys()): - if label >= 0: - clusters.append(cluster_map[label]) - - # Add noise points as singleton clusters (label == -1) - if -1 in cluster_map: - for idx in cluster_map[-1]: - clusters.append([idx]) - - return clusters - - def select_representatives( - self, - clusters: List[List[int]], - results: List["SearchResult"], - embeddings: Optional["np.ndarray"] = None, - ) -> List["SearchResult"]: - """Select representative results from each cluster. - - Selects the result with the highest score from each cluster. - - Args: - clusters: List of clusters from cluster() method. - results: Original list of SearchResult objects. - embeddings: Optional embeddings (not used in score-based selection). - - Returns: - List of representative SearchResult objects, one per cluster, - ordered by score (highest first). 
- """ - if not clusters or not results: - return [] - - representatives: List["SearchResult"] = [] - - for cluster_indices in clusters: - if not cluster_indices: - continue - - # Find the result with the highest score in this cluster - best_idx = max(cluster_indices, key=lambda i: results[i].score) - representatives.append(results[best_idx]) - - # Sort by score descending - representatives.sort(key=lambda r: r.score, reverse=True) - - return representatives diff --git a/codex-lens/build/lib/codexlens/search/clustering/factory.py b/codex-lens/build/lib/codexlens/search/clustering/factory.py deleted file mode 100644 index 6c7f5b6e..00000000 --- a/codex-lens/build/lib/codexlens/search/clustering/factory.py +++ /dev/null @@ -1,202 +0,0 @@ -"""Factory for creating clustering strategies. - -Provides a unified interface for instantiating different clustering backends -with automatic fallback chain: hdbscan -> dbscan -> noop. -""" - -from __future__ import annotations - -from typing import Any, Optional - -from .base import BaseClusteringStrategy, ClusteringConfig -from .noop_strategy import NoOpStrategy - - -def check_clustering_strategy_available(strategy: str) -> tuple[bool, str | None]: - """Check whether a specific clustering strategy can be used. - - Args: - strategy: Strategy name to check. Options: - - "hdbscan": HDBSCAN clustering (requires hdbscan package) - - "dbscan": DBSCAN clustering (requires sklearn) - - "frequency": Frequency-based clustering (always available) - - "noop": No-op strategy (always available) - - Returns: - Tuple of (is_available, error_message). - error_message is None if available, otherwise contains install instructions. - """ - strategy = (strategy or "").strip().lower() - - if strategy == "hdbscan": - try: - import hdbscan # noqa: F401 - except ImportError: - return False, ( - "hdbscan package not available. 
" - "Install with: pip install codexlens[clustering]" - ) - return True, None - - if strategy == "dbscan": - try: - from sklearn.cluster import DBSCAN # noqa: F401 - except ImportError: - return False, ( - "scikit-learn package not available. " - "Install with: pip install codexlens[clustering]" - ) - return True, None - - if strategy == "frequency": - # Frequency strategy is always available (no external deps) - return True, None - - if strategy == "noop": - return True, None - - return False, ( - f"Invalid clustering strategy: {strategy}. " - "Must be 'hdbscan', 'dbscan', 'frequency', or 'noop'." - ) - - -def get_strategy( - strategy: str = "hdbscan", - config: Optional[ClusteringConfig] = None, - *, - fallback: bool = True, - **kwargs: Any, -) -> BaseClusteringStrategy: - """Factory function to create clustering strategy with fallback chain. - - The fallback chain is: hdbscan -> dbscan -> frequency -> noop - - Args: - strategy: Clustering strategy to use. Options: - - "hdbscan": HDBSCAN clustering (default, recommended) - - "dbscan": DBSCAN clustering (fallback) - - "frequency": Frequency-based clustering (groups by symbol occurrence) - - "noop": No-op strategy (returns all results ungrouped) - - "auto": Try hdbscan, then dbscan, then noop - config: Clustering configuration. Uses defaults if not provided. - For frequency strategy, pass FrequencyConfig for full control. - fallback: If True (default), automatically fall back to next strategy - in the chain when primary is unavailable. If False, raise ImportError - when requested strategy is unavailable. - **kwargs: Additional strategy-specific arguments. - For DBSCANStrategy: eps, eps_percentile - For FrequencyStrategy: group_by, min_frequency, etc. - - Returns: - BaseClusteringStrategy: Configured clustering strategy instance. - - Raises: - ValueError: If strategy is not recognized. - ImportError: If required dependencies are not installed and fallback=False. 
- - Example: - >>> from codexlens.search.clustering import get_strategy, ClusteringConfig - >>> config = ClusteringConfig(min_cluster_size=3) - >>> # Auto-select best available strategy - >>> strategy = get_strategy("auto", config) - >>> # Explicitly use HDBSCAN (will fall back if unavailable) - >>> strategy = get_strategy("hdbscan", config) - >>> # Use frequency-based strategy - >>> from codexlens.search.clustering import FrequencyConfig - >>> freq_config = FrequencyConfig(min_frequency=2, group_by="symbol") - >>> strategy = get_strategy("frequency", freq_config) - """ - strategy = (strategy or "").strip().lower() - - # Handle "auto" - try strategies in order - if strategy == "auto": - return _get_best_available_strategy(config, **kwargs) - - if strategy == "hdbscan": - ok, err = check_clustering_strategy_available("hdbscan") - if ok: - from .hdbscan_strategy import HDBSCANStrategy - return HDBSCANStrategy(config) - - if fallback: - # Try dbscan fallback - ok_dbscan, _ = check_clustering_strategy_available("dbscan") - if ok_dbscan: - from .dbscan_strategy import DBSCANStrategy - return DBSCANStrategy(config, **kwargs) - # Final fallback to noop - return NoOpStrategy(config) - - raise ImportError(err) - - if strategy == "dbscan": - ok, err = check_clustering_strategy_available("dbscan") - if ok: - from .dbscan_strategy import DBSCANStrategy - return DBSCANStrategy(config, **kwargs) - - if fallback: - # Fallback to noop - return NoOpStrategy(config) - - raise ImportError(err) - - if strategy == "frequency": - from .frequency_strategy import FrequencyStrategy, FrequencyConfig - # If config is ClusteringConfig but not FrequencyConfig, create default FrequencyConfig - if config is None or not isinstance(config, FrequencyConfig): - freq_config = FrequencyConfig(**kwargs) if kwargs else FrequencyConfig() - else: - freq_config = config - return FrequencyStrategy(freq_config) - - if strategy == "noop": - return NoOpStrategy(config) - - raise ValueError( - f"Unknown 
clustering strategy: {strategy}. " - "Supported strategies: 'hdbscan', 'dbscan', 'frequency', 'noop', 'auto'" - ) - - -def _get_best_available_strategy( - config: Optional[ClusteringConfig] = None, - **kwargs: Any, -) -> BaseClusteringStrategy: - """Get the best available clustering strategy. - - Tries strategies in order: hdbscan -> dbscan -> noop - - Args: - config: Clustering configuration. - **kwargs: Additional strategy-specific arguments. - - Returns: - Best available clustering strategy instance. - """ - # Try HDBSCAN first - ok, _ = check_clustering_strategy_available("hdbscan") - if ok: - from .hdbscan_strategy import HDBSCANStrategy - return HDBSCANStrategy(config) - - # Try DBSCAN second - ok, _ = check_clustering_strategy_available("dbscan") - if ok: - from .dbscan_strategy import DBSCANStrategy - return DBSCANStrategy(config, **kwargs) - - # Fallback to NoOp - return NoOpStrategy(config) - - -# Alias for backward compatibility -ClusteringStrategyFactory = type( - "ClusteringStrategyFactory", - (), - { - "get_strategy": staticmethod(get_strategy), - "check_available": staticmethod(check_clustering_strategy_available), - }, -) diff --git a/codex-lens/build/lib/codexlens/search/clustering/frequency_strategy.py b/codex-lens/build/lib/codexlens/search/clustering/frequency_strategy.py deleted file mode 100644 index 48ddb00b..00000000 --- a/codex-lens/build/lib/codexlens/search/clustering/frequency_strategy.py +++ /dev/null @@ -1,263 +0,0 @@ -"""Frequency-based clustering strategy for search result deduplication. - -This strategy groups search results by symbol/method name and prunes based on -occurrence frequency. High-frequency symbols (frequently referenced methods) -are considered more important and retained, while low-frequency results -(potentially noise) can be filtered out. 
- -Use cases: -- Prioritize commonly called methods/functions -- Filter out one-off results that may be less relevant -- Deduplicate results pointing to the same symbol from different locations -""" - -from __future__ import annotations - -from collections import defaultdict -from dataclasses import dataclass -from typing import TYPE_CHECKING, Dict, List, Optional, Literal - -from .base import BaseClusteringStrategy, ClusteringConfig - -if TYPE_CHECKING: - import numpy as np - from codexlens.entities import SearchResult - - -@dataclass -class FrequencyConfig(ClusteringConfig): - """Configuration for frequency-based clustering strategy. - - Attributes: - group_by: Field to group results by for frequency counting. - - 'symbol': Group by symbol_name (default, for method/function dedup) - - 'file': Group by file path - - 'symbol_kind': Group by symbol type (function, class, etc.) - min_frequency: Minimum occurrence count to keep a result. - Results appearing less than this are considered noise and pruned. - max_representatives_per_group: Maximum results to keep per symbol group. - frequency_weight: How much to boost score based on frequency. - Final score = original_score * (1 + frequency_weight * log(frequency)) - keep_mode: How to handle low-frequency results. 
- - 'filter': Remove results below min_frequency - - 'demote': Keep but lower their score ranking - """ - - group_by: Literal["symbol", "file", "symbol_kind"] = "symbol" - min_frequency: int = 1 # 1 means keep all, 2+ filters singletons - max_representatives_per_group: int = 3 - frequency_weight: float = 0.1 # Boost factor for frequency - keep_mode: Literal["filter", "demote"] = "demote" - - def __post_init__(self) -> None: - """Validate configuration parameters.""" - # Skip parent validation since we don't use HDBSCAN params - if self.min_frequency < 1: - raise ValueError("min_frequency must be >= 1") - if self.max_representatives_per_group < 1: - raise ValueError("max_representatives_per_group must be >= 1") - if self.frequency_weight < 0: - raise ValueError("frequency_weight must be >= 0") - if self.group_by not in ("symbol", "file", "symbol_kind"): - raise ValueError(f"group_by must be one of: symbol, file, symbol_kind; got {self.group_by}") - if self.keep_mode not in ("filter", "demote"): - raise ValueError(f"keep_mode must be one of: filter, demote; got {self.keep_mode}") - - -class FrequencyStrategy(BaseClusteringStrategy): - """Frequency-based clustering strategy for search result deduplication. - - This strategy groups search results by symbol name (or file/kind) and: - 1. Counts how many times each symbol appears in results - 2. Higher frequency = more important (frequently referenced method) - 3. Filters or demotes low-frequency results - 4. 
Selects top representatives from each frequency group - - Unlike embedding-based strategies (HDBSCAN, DBSCAN), this strategy: - - Does NOT require embeddings (works with metadata only) - - Is very fast (O(n) complexity) - - Is deterministic (no random initialization) - - Works well for symbol-level deduplication - - Example: - >>> config = FrequencyConfig(min_frequency=2, group_by="symbol") - >>> strategy = FrequencyStrategy(config) - >>> # Results with symbol "authenticate" appearing 5 times - >>> # will be prioritized over "helper_func" appearing once - >>> representatives = strategy.fit_predict(embeddings, results) - """ - - def __init__(self, config: Optional[FrequencyConfig] = None) -> None: - """Initialize the frequency strategy. - - Args: - config: Frequency configuration. Uses defaults if not provided. - """ - self.config: FrequencyConfig = config or FrequencyConfig() - - def _get_group_key(self, result: "SearchResult") -> str: - """Extract grouping key from a search result. - - Args: - result: SearchResult to extract key from. - - Returns: - String key for grouping (symbol name, file path, or kind). - """ - if self.config.group_by == "symbol": - # Use symbol_name if available, otherwise fall back to file:line - symbol = getattr(result, "symbol_name", None) - if symbol: - return str(symbol) - # Fallback: use file path + start_line as pseudo-symbol - start_line = getattr(result, "start_line", 0) or 0 - return f"{result.path}:{start_line}" - - elif self.config.group_by == "file": - return str(result.path) - - elif self.config.group_by == "symbol_kind": - kind = getattr(result, "symbol_kind", None) - return str(kind) if kind else "unknown" - - return str(result.path) # Default fallback - - def cluster( - self, - embeddings: "np.ndarray", - results: List["SearchResult"], - ) -> List[List[int]]: - """Group search results by frequency of occurrence. - - Note: This method ignores embeddings and groups by metadata only. 
- The embeddings parameter is kept for interface compatibility. - - Args: - embeddings: Ignored (kept for interface compatibility). - results: List of SearchResult objects to cluster. - - Returns: - List of clusters (groups), where each cluster contains indices - of results with the same grouping key. Clusters are ordered by - frequency (highest frequency first). - """ - if not results: - return [] - - # Group results by key - groups: Dict[str, List[int]] = defaultdict(list) - for idx, result in enumerate(results): - key = self._get_group_key(result) - groups[key].append(idx) - - # Sort groups by frequency (descending) then by key (for stability) - sorted_groups = sorted( - groups.items(), - key=lambda x: (-len(x[1]), x[0]) # -frequency, then alphabetical - ) - - # Convert to list of clusters - clusters = [indices for _, indices in sorted_groups] - - return clusters - - def select_representatives( - self, - clusters: List[List[int]], - results: List["SearchResult"], - embeddings: Optional["np.ndarray"] = None, - ) -> List["SearchResult"]: - """Select representative results based on frequency and score. - - For each frequency group: - 1. If frequency < min_frequency: filter or demote based on keep_mode - 2. Sort by score within group - 3. Apply frequency boost to scores - 4. Select top N representatives - - Args: - clusters: List of clusters from cluster() method. - results: Original list of SearchResult objects. - embeddings: Optional embeddings (used for tie-breaking if provided). - - Returns: - List of representative SearchResult objects, ordered by - frequency-adjusted score (highest first). 
- """ - import math - - if not clusters or not results: - return [] - - representatives: List["SearchResult"] = [] - demoted: List["SearchResult"] = [] - - for cluster_indices in clusters: - if not cluster_indices: - continue - - frequency = len(cluster_indices) - - # Get results in this cluster, sorted by score - cluster_results = [results[i] for i in cluster_indices] - cluster_results.sort(key=lambda r: getattr(r, "score", 0.0), reverse=True) - - # Check frequency threshold - if frequency < self.config.min_frequency: - if self.config.keep_mode == "filter": - # Skip low-frequency results entirely - continue - else: # demote mode - # Keep but add to demoted list (lower priority) - for result in cluster_results[: self.config.max_representatives_per_group]: - demoted.append(result) - continue - - # Apply frequency boost and select top representatives - for result in cluster_results[: self.config.max_representatives_per_group]: - # Calculate frequency-boosted score - original_score = getattr(result, "score", 0.0) - # log(frequency + 1) to handle frequency=1 case smoothly - frequency_boost = 1.0 + self.config.frequency_weight * math.log(frequency + 1) - boosted_score = original_score * frequency_boost - - # Create new result with boosted score and frequency metadata - # Note: SearchResult might be immutable, so we preserve original - # and track boosted score in metadata - if hasattr(result, "metadata") and isinstance(result.metadata, dict): - result.metadata["frequency"] = frequency - result.metadata["frequency_boosted_score"] = boosted_score - - representatives.append(result) - - # Sort representatives by boosted score (or original score as fallback) - def get_sort_score(r: "SearchResult") -> float: - if hasattr(r, "metadata") and isinstance(r.metadata, dict): - return r.metadata.get("frequency_boosted_score", getattr(r, "score", 0.0)) - return getattr(r, "score", 0.0) - - representatives.sort(key=get_sort_score, reverse=True) - - # Add demoted results at the end - 
if demoted: - demoted.sort(key=lambda r: getattr(r, "score", 0.0), reverse=True) - representatives.extend(demoted) - - return representatives - - def fit_predict( - self, - embeddings: "np.ndarray", - results: List["SearchResult"], - ) -> List["SearchResult"]: - """Convenience method to cluster and select representatives in one call. - - Args: - embeddings: NumPy array (may be ignored for frequency-based clustering). - results: List of SearchResult objects. - - Returns: - List of representative SearchResult objects. - """ - clusters = self.cluster(embeddings, results) - return self.select_representatives(clusters, results, embeddings) diff --git a/codex-lens/build/lib/codexlens/search/clustering/hdbscan_strategy.py b/codex-lens/build/lib/codexlens/search/clustering/hdbscan_strategy.py deleted file mode 100644 index 45e8cdd0..00000000 --- a/codex-lens/build/lib/codexlens/search/clustering/hdbscan_strategy.py +++ /dev/null @@ -1,153 +0,0 @@ -"""HDBSCAN-based clustering strategy for search results. - -HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise) -is the primary clustering strategy for grouping similar search results. -""" - -from __future__ import annotations - -from typing import TYPE_CHECKING, List, Optional - -from .base import BaseClusteringStrategy, ClusteringConfig - -if TYPE_CHECKING: - import numpy as np - from codexlens.entities import SearchResult - - -class HDBSCANStrategy(BaseClusteringStrategy): - """HDBSCAN-based clustering strategy. - - Uses HDBSCAN algorithm to cluster search results based on embedding similarity. 
- HDBSCAN is preferred over DBSCAN because it: - - Automatically determines the number of clusters - - Handles varying density clusters well - - Identifies noise points (outliers) effectively - - Example: - >>> from codexlens.search.clustering import HDBSCANStrategy, ClusteringConfig - >>> config = ClusteringConfig(min_cluster_size=3, metric='cosine') - >>> strategy = HDBSCANStrategy(config) - >>> clusters = strategy.cluster(embeddings, results) - >>> representatives = strategy.select_representatives(clusters, results) - """ - - def __init__(self, config: Optional[ClusteringConfig] = None) -> None: - """Initialize HDBSCAN clustering strategy. - - Args: - config: Clustering configuration. Uses defaults if not provided. - - Raises: - ImportError: If hdbscan package is not installed. - """ - super().__init__(config) - # Validate hdbscan is available - try: - import hdbscan # noqa: F401 - except ImportError as exc: - raise ImportError( - "hdbscan package is required for HDBSCANStrategy. " - "Install with: pip install codexlens[clustering]" - ) from exc - - def cluster( - self, - embeddings: "np.ndarray", - results: List["SearchResult"], - ) -> List[List[int]]: - """Cluster search results using HDBSCAN algorithm. - - Args: - embeddings: NumPy array of shape (n_results, embedding_dim) - containing the embedding vectors for each result. - results: List of SearchResult objects corresponding to embeddings. - - Returns: - List of clusters, where each cluster is a list of indices - into the results list. Noise points are returned as singleton clusters. 
- """ - import hdbscan - import numpy as np - - n_results = len(results) - if n_results == 0: - return [] - - # Handle edge case: fewer results than min_cluster_size - if n_results < self.config.min_cluster_size: - # Return each result as its own singleton cluster - return [[i] for i in range(n_results)] - - # Configure HDBSCAN clusterer - clusterer = hdbscan.HDBSCAN( - min_cluster_size=self.config.min_cluster_size, - min_samples=self.config.min_samples, - metric=self.config.metric, - cluster_selection_epsilon=self.config.cluster_selection_epsilon, - allow_single_cluster=self.config.allow_single_cluster, - prediction_data=self.config.prediction_data, - ) - - # Fit and get cluster labels - # Labels: -1 = noise, 0+ = cluster index - labels = clusterer.fit_predict(embeddings) - - # Group indices by cluster label - cluster_map: dict[int, list[int]] = {} - for idx, label in enumerate(labels): - if label not in cluster_map: - cluster_map[label] = [] - cluster_map[label].append(idx) - - # Build result: non-noise clusters first, then noise as singletons - clusters: List[List[int]] = [] - - # Add proper clusters (label >= 0) - for label in sorted(cluster_map.keys()): - if label >= 0: - clusters.append(cluster_map[label]) - - # Add noise points as singleton clusters (label == -1) - if -1 in cluster_map: - for idx in cluster_map[-1]: - clusters.append([idx]) - - return clusters - - def select_representatives( - self, - clusters: List[List[int]], - results: List["SearchResult"], - embeddings: Optional["np.ndarray"] = None, - ) -> List["SearchResult"]: - """Select representative results from each cluster. - - Selects the result with the highest score from each cluster. - - Args: - clusters: List of clusters from cluster() method. - results: Original list of SearchResult objects. - embeddings: Optional embeddings (not used in score-based selection). - - Returns: - List of representative SearchResult objects, one per cluster, - ordered by score (highest first). 
- """ - if not clusters or not results: - return [] - - representatives: List["SearchResult"] = [] - - for cluster_indices in clusters: - if not cluster_indices: - continue - - # Find the result with the highest score in this cluster - best_idx = max(cluster_indices, key=lambda i: results[i].score) - representatives.append(results[best_idx]) - - # Sort by score descending - representatives.sort(key=lambda r: r.score, reverse=True) - - return representatives diff --git a/codex-lens/build/lib/codexlens/search/clustering/noop_strategy.py b/codex-lens/build/lib/codexlens/search/clustering/noop_strategy.py deleted file mode 100644 index eda36098..00000000 --- a/codex-lens/build/lib/codexlens/search/clustering/noop_strategy.py +++ /dev/null @@ -1,83 +0,0 @@ -"""No-op clustering strategy for search results. - -NoOpStrategy returns all results ungrouped when clustering dependencies -are not available or clustering is disabled. -""" - -from __future__ import annotations - -from typing import TYPE_CHECKING, List, Optional - -from .base import BaseClusteringStrategy, ClusteringConfig - -if TYPE_CHECKING: - import numpy as np - from codexlens.entities import SearchResult - - -class NoOpStrategy(BaseClusteringStrategy): - """No-op clustering strategy that returns all results ungrouped. - - This strategy is used as a final fallback when no clustering dependencies - are available, or when clustering is explicitly disabled. Each result - is treated as its own singleton cluster. - - Example: - >>> from codexlens.search.clustering import NoOpStrategy - >>> strategy = NoOpStrategy() - >>> clusters = strategy.cluster(embeddings, results) - >>> # Returns [[0], [1], [2], ...] - each result in its own cluster - >>> representatives = strategy.select_representatives(clusters, results) - >>> # Returns all results sorted by score - """ - - def __init__(self, config: Optional[ClusteringConfig] = None) -> None: - """Initialize NoOp clustering strategy. 
- - Args: - config: Clustering configuration. Ignored for NoOpStrategy - but accepted for interface compatibility. - """ - super().__init__(config) - - def cluster( - self, - embeddings: "np.ndarray", - results: List["SearchResult"], - ) -> List[List[int]]: - """Return each result as its own singleton cluster. - - Args: - embeddings: NumPy array of shape (n_results, embedding_dim). - Not used but accepted for interface compatibility. - results: List of SearchResult objects. - - Returns: - List of singleton clusters, one per result. - """ - return [[i] for i in range(len(results))] - - def select_representatives( - self, - clusters: List[List[int]], - results: List["SearchResult"], - embeddings: Optional["np.ndarray"] = None, - ) -> List["SearchResult"]: - """Return all results sorted by score. - - Since each cluster is a singleton, this effectively returns all - results sorted by score descending. - - Args: - clusters: List of singleton clusters. - results: Original list of SearchResult objects. - embeddings: Optional embeddings (not used). - - Returns: - All SearchResult objects sorted by score (highest first). 
- """ - if not results: - return [] - - # Return all results sorted by score - return sorted(results, key=lambda r: r.score, reverse=True) diff --git a/codex-lens/build/lib/codexlens/search/enrichment.py b/codex-lens/build/lib/codexlens/search/enrichment.py deleted file mode 100644 index 110f56b7..00000000 --- a/codex-lens/build/lib/codexlens/search/enrichment.py +++ /dev/null @@ -1,171 +0,0 @@ -# codex-lens/src/codexlens/search/enrichment.py -"""Relationship enrichment for search results.""" -import sqlite3 -from pathlib import Path -from typing import List, Dict, Any, Optional - -from codexlens.config import Config -from codexlens.entities import SearchResult -from codexlens.search.graph_expander import GraphExpander -from codexlens.storage.path_mapper import PathMapper - - -class RelationshipEnricher: - """Enriches search results with code graph relationships.""" - - def __init__(self, index_path: Path): - """Initialize with path to index database. - - Args: - index_path: Path to _index.db SQLite database - """ - self.index_path = index_path - self.db_conn: Optional[sqlite3.Connection] = None - self._connect() - - def _connect(self) -> None: - """Establish read-only database connection.""" - if self.index_path.exists(): - self.db_conn = sqlite3.connect( - f"file:{self.index_path}?mode=ro", - uri=True, - check_same_thread=False - ) - self.db_conn.row_factory = sqlite3.Row - - def enrich(self, results: List[Dict[str, Any]], limit: int = 10) -> List[Dict[str, Any]]: - """Add relationship data to search results. 
- - Args: - results: List of search result dictionaries - limit: Maximum number of results to enrich - - Returns: - Results with relationships field added - """ - if not self.db_conn: - return results - - for result in results[:limit]: - file_path = result.get('file') or result.get('path') - symbol_name = result.get('symbol') - result['relationships'] = self._find_relationships(file_path, symbol_name) - return results - - def _find_relationships(self, file_path: Optional[str], symbol_name: Optional[str]) -> List[Dict[str, Any]]: - """Query relationships for a symbol. - - Args: - file_path: Path to file containing the symbol - symbol_name: Name of the symbol - - Returns: - List of relationship dictionaries with type, direction, target/source, file, line - """ - if not self.db_conn or not symbol_name: - return [] - - relationships = [] - cursor = self.db_conn.cursor() - - try: - # Find symbol ID(s) by name and optionally file - if file_path: - cursor.execute( - 'SELECT id FROM symbols WHERE name = ? AND file_path = ?', - (symbol_name, file_path) - ) - else: - cursor.execute('SELECT id FROM symbols WHERE name = ?', (symbol_name,)) - - symbol_ids = [row[0] for row in cursor.fetchall()] - - if not symbol_ids: - return [] - - # Query outgoing relationships (symbol is source) - placeholders = ','.join('?' 
* len(symbol_ids)) - cursor.execute(f''' - SELECT sr.relationship_type, sr.target_symbol_fqn, sr.file_path, sr.line - FROM symbol_relationships sr - WHERE sr.source_symbol_id IN ({placeholders}) - ''', symbol_ids) - - for row in cursor.fetchall(): - relationships.append({ - 'type': row[0], - 'direction': 'outgoing', - 'target': row[1], - 'file': row[2], - 'line': row[3], - }) - - # Query incoming relationships (symbol is target) - # Match against symbol name or qualified name patterns - cursor.execute(''' - SELECT sr.relationship_type, s.name AS source_name, sr.file_path, sr.line - FROM symbol_relationships sr - JOIN symbols s ON sr.source_symbol_id = s.id - WHERE sr.target_symbol_fqn = ? OR sr.target_symbol_fqn LIKE ? - ''', (symbol_name, f'%.{symbol_name}')) - - for row in cursor.fetchall(): - rel_type = row[0] - # Convert to incoming type - incoming_type = self._to_incoming_type(rel_type) - relationships.append({ - 'type': incoming_type, - 'direction': 'incoming', - 'source': row[1], - 'file': row[2], - 'line': row[3], - }) - - except sqlite3.Error: - return [] - - return relationships - - def _to_incoming_type(self, outgoing_type: str) -> str: - """Convert outgoing relationship type to incoming type. 
- - Args: - outgoing_type: The outgoing relationship type (e.g., 'calls', 'imports') - - Returns: - Corresponding incoming type (e.g., 'called_by', 'imported_by') - """ - type_map = { - 'calls': 'called_by', - 'imports': 'imported_by', - 'extends': 'extended_by', - } - return type_map.get(outgoing_type, f'{outgoing_type}_by') - - def close(self) -> None: - """Close database connection.""" - if self.db_conn: - self.db_conn.close() - self.db_conn = None - - def __enter__(self) -> 'RelationshipEnricher': - return self - - def __exit__(self, exc_type, exc_val, exc_tb) -> None: - self.close() - - -class SearchEnrichmentPipeline: - """Search post-processing pipeline (optional enrichments).""" - - def __init__(self, mapper: PathMapper, *, config: Optional[Config] = None) -> None: - self._config = config - self._graph_expander = GraphExpander(mapper, config=config) - - def expand_related_results(self, results: List[SearchResult]) -> List[SearchResult]: - """Expand base results with related symbols when enabled in config.""" - if self._config is None or not getattr(self._config, "enable_graph_expansion", False): - return [] - - depth = int(getattr(self._config, "graph_expansion_depth", 2) or 2) - return self._graph_expander.expand(results, depth=depth) diff --git a/codex-lens/build/lib/codexlens/search/graph_expander.py b/codex-lens/build/lib/codexlens/search/graph_expander.py deleted file mode 100644 index 73261d53..00000000 --- a/codex-lens/build/lib/codexlens/search/graph_expander.py +++ /dev/null @@ -1,264 +0,0 @@ -"""Graph expansion for search results using precomputed neighbors. - -Expands top search results with related symbol definitions by traversing -precomputed N-hop neighbors stored in the per-directory index databases. 
-""" - -from __future__ import annotations - -import logging -import sqlite3 -from pathlib import Path -from typing import Dict, List, Optional, Sequence, Tuple - -from codexlens.config import Config -from codexlens.entities import SearchResult -from codexlens.storage.path_mapper import PathMapper - -logger = logging.getLogger(__name__) - - -def _result_key(result: SearchResult) -> Tuple[str, Optional[str], Optional[int], Optional[int]]: - return (result.path, result.symbol_name, result.start_line, result.end_line) - - -def _slice_content_block(content: str, start_line: Optional[int], end_line: Optional[int]) -> Optional[str]: - if content is None: - return None - if start_line is None or end_line is None: - return None - if start_line < 1 or end_line < start_line: - return None - - lines = content.splitlines() - start_idx = max(0, start_line - 1) - end_idx = min(len(lines), end_line) - if start_idx >= len(lines): - return None - return "\n".join(lines[start_idx:end_idx]) - - -class GraphExpander: - """Expands SearchResult lists with related symbols from the code graph.""" - - def __init__(self, mapper: PathMapper, *, config: Optional[Config] = None) -> None: - self._mapper = mapper - self._config = config - self._logger = logging.getLogger(__name__) - - def expand( - self, - results: Sequence[SearchResult], - *, - depth: Optional[int] = None, - max_expand: int = 10, - max_related: int = 50, - ) -> List[SearchResult]: - """Expand top results with related symbols. - - Args: - results: Base ranked results. - depth: Maximum relationship depth to include (defaults to Config or 2). - max_expand: Only expand the top-N base results to bound cost. - max_related: Maximum related results to return. - - Returns: - A list of related SearchResult objects with relationship_depth metadata. 
- """ - if not results: - return [] - - configured_depth = getattr(self._config, "graph_expansion_depth", 2) if self._config else 2 - max_depth = int(depth if depth is not None else configured_depth) - if max_depth <= 0: - return [] - max_depth = min(max_depth, 2) - - expand_count = max(0, int(max_expand)) - related_limit = max(0, int(max_related)) - if expand_count == 0 or related_limit == 0: - return [] - - seen = {_result_key(r) for r in results} - related_results: List[SearchResult] = [] - conn_cache: Dict[Path, sqlite3.Connection] = {} - - try: - for base in list(results)[:expand_count]: - if len(related_results) >= related_limit: - break - - if not base.symbol_name or not base.path: - continue - - index_path = self._mapper.source_to_index_db(Path(base.path).parent) - conn = conn_cache.get(index_path) - if conn is None: - conn = self._connect_readonly(index_path) - if conn is None: - continue - conn_cache[index_path] = conn - - source_ids = self._resolve_source_symbol_ids( - conn, - file_path=base.path, - symbol_name=base.symbol_name, - symbol_kind=base.symbol_kind, - ) - if not source_ids: - continue - - for source_id in source_ids: - neighbors = self._get_neighbors(conn, source_id, max_depth=max_depth, limit=related_limit) - for neighbor_id, rel_depth in neighbors: - if len(related_results) >= related_limit: - break - row = self._get_symbol_details(conn, neighbor_id) - if row is None: - continue - - path = str(row["full_path"]) - symbol_name = str(row["name"]) - symbol_kind = str(row["kind"]) - start_line = int(row["start_line"]) if row["start_line"] is not None else None - end_line = int(row["end_line"]) if row["end_line"] is not None else None - content_block = _slice_content_block( - str(row["content"]) if row["content"] is not None else "", - start_line, - end_line, - ) - - score = float(base.score) * (0.5 ** int(rel_depth)) - candidate = SearchResult( - path=path, - score=max(0.0, score), - excerpt=None, - content=content_block, - start_line=start_line, 
- end_line=end_line, - symbol_name=symbol_name, - symbol_kind=symbol_kind, - metadata={"relationship_depth": int(rel_depth)}, - ) - - key = _result_key(candidate) - if key in seen: - continue - seen.add(key) - related_results.append(candidate) - - finally: - for conn in conn_cache.values(): - try: - conn.close() - except Exception: - pass - - return related_results - - def _connect_readonly(self, index_path: Path) -> Optional[sqlite3.Connection]: - try: - if not index_path.exists() or index_path.stat().st_size == 0: - return None - except OSError: - return None - - try: - conn = sqlite3.connect(f"file:{index_path}?mode=ro", uri=True, check_same_thread=False) - conn.row_factory = sqlite3.Row - return conn - except Exception as exc: - self._logger.debug("GraphExpander failed to open %s: %s", index_path, exc) - return None - - def _resolve_source_symbol_ids( - self, - conn: sqlite3.Connection, - *, - file_path: str, - symbol_name: str, - symbol_kind: Optional[str], - ) -> List[int]: - try: - if symbol_kind: - rows = conn.execute( - """ - SELECT s.id - FROM symbols s - JOIN files f ON f.id = s.file_id - WHERE f.full_path = ? AND s.name = ? AND s.kind = ? - """, - (file_path, symbol_name, symbol_kind), - ).fetchall() - else: - rows = conn.execute( - """ - SELECT s.id - FROM symbols s - JOIN files f ON f.id = s.file_id - WHERE f.full_path = ? AND s.name = ? - """, - (file_path, symbol_name), - ).fetchall() - except sqlite3.Error: - return [] - - ids: List[int] = [] - for row in rows: - try: - ids.append(int(row["id"])) - except Exception: - continue - return ids - - def _get_neighbors( - self, - conn: sqlite3.Connection, - source_symbol_id: int, - *, - max_depth: int, - limit: int, - ) -> List[Tuple[int, int]]: - try: - rows = conn.execute( - """ - SELECT neighbor_symbol_id, relationship_depth - FROM graph_neighbors - WHERE source_symbol_id = ? AND relationship_depth <= ? - ORDER BY relationship_depth ASC, neighbor_symbol_id ASC - LIMIT ? 
- """, - (int(source_symbol_id), int(max_depth), int(limit)), - ).fetchall() - except sqlite3.Error: - return [] - - neighbors: List[Tuple[int, int]] = [] - for row in rows: - try: - neighbors.append((int(row["neighbor_symbol_id"]), int(row["relationship_depth"]))) - except Exception: - continue - return neighbors - - def _get_symbol_details(self, conn: sqlite3.Connection, symbol_id: int) -> Optional[sqlite3.Row]: - try: - return conn.execute( - """ - SELECT - s.id, - s.name, - s.kind, - s.start_line, - s.end_line, - f.full_path, - f.content - FROM symbols s - JOIN files f ON f.id = s.file_id - WHERE s.id = ? - """, - (int(symbol_id),), - ).fetchone() - except sqlite3.Error: - return None - diff --git a/codex-lens/build/lib/codexlens/search/hybrid_search.py b/codex-lens/build/lib/codexlens/search/hybrid_search.py deleted file mode 100644 index 805b2fdb..00000000 --- a/codex-lens/build/lib/codexlens/search/hybrid_search.py +++ /dev/null @@ -1,1409 +0,0 @@ -"""Hybrid search engine orchestrating parallel exact/fuzzy/vector searches with RRF fusion. - -Coordinates multiple search backends in parallel using ThreadPoolExecutor and combines -results via Reciprocal Rank Fusion (RRF) algorithm. -""" - -from __future__ import annotations - -import logging -import time -from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeoutError, as_completed -from contextlib import contextmanager -from pathlib import Path -from typing import Any, Dict, List, Optional - - -@contextmanager -def timer(name: str, logger: logging.Logger, level: int = logging.DEBUG): - """Context manager for timing code blocks. 
- - Args: - name: Name of the operation being timed - logger: Logger instance to use - level: Logging level (default DEBUG) - """ - start = time.perf_counter() - try: - yield - finally: - elapsed_ms = (time.perf_counter() - start) * 1000 - logger.log(level, "[TIMING] %s: %.2fms", name, elapsed_ms) - -from codexlens.config import Config -from codexlens.config import VECTORS_HNSW_NAME -from codexlens.entities import SearchResult -from codexlens.search.ranking import ( - DEFAULT_WEIGHTS, - FTS_FALLBACK_WEIGHTS, - QueryIntent, - apply_symbol_boost, - cross_encoder_rerank, - detect_query_intent, - filter_results_by_category, - get_rrf_weights, - reciprocal_rank_fusion, - rerank_results, - simple_weighted_fusion, - tag_search_source, -) -from codexlens.storage.dir_index import DirIndexStore - -# Optional LSP imports (for real-time graph expansion) -try: - from codexlens.lsp import LspBridge, LspGraphBuilder - HAS_LSP = True -except ImportError: - HAS_LSP = False - - -# Three-way fusion weights (FTS + Vector + SPLADE) -THREE_WAY_WEIGHTS = { - "exact": 0.2, - "splade": 0.3, - "vector": 0.5, -} - - -class HybridSearchEngine: - """Hybrid search engine with parallel execution and RRF fusion. - - Orchestrates searches across exact FTS, fuzzy FTS, and optional vector backends, - executing them in parallel and fusing results via Reciprocal Rank Fusion. - - Attributes: - logger: Python logger instance - default_weights: Default RRF weights for each source - """ - - # NOTE: DEFAULT_WEIGHTS imported from ranking.py - single source of truth - # Default RRF weights: SPLADE-based hybrid (splade: 0.4, vector: 0.6) - # FTS fallback mode uses FTS_FALLBACK_WEIGHTS (exact: 0.3, fuzzy: 0.1, vector: 0.6) - - def __init__( - self, - weights: Optional[Dict[str, float]] = None, - config: Optional[Config] = None, - embedder: Any = None, - ): - """Initialize hybrid search engine. 
- - Args: - weights: Optional custom RRF weights (default: DEFAULT_WEIGHTS) - config: Optional runtime config (enables optional reranking features) - embedder: Optional embedder instance for embedding-based reranking - - Raises: - TypeError: If weights is not a dict (e.g., if a Path is passed) - """ - self.logger = logging.getLogger(__name__) - - # Validate weights type to catch common usage errors - if weights is not None and not isinstance(weights, dict): - raise TypeError( - f"weights must be a dict, got {type(weights).__name__}. " - f"Did you mean to pass index_path to search() instead of __init__()?" - ) - - self.weights = weights or DEFAULT_WEIGHTS.copy() - self._config = config - self.embedder = embedder - self.reranker: Any = None - self._use_gpu = config.embedding_use_gpu if config else True - - def search( - self, - index_path: Path, - query: str, - limit: int = 20, - enable_fuzzy: bool = True, - enable_vector: bool = False, - pure_vector: bool = False, - enable_splade: bool = False, - enable_lsp_graph: bool = False, - lsp_max_depth: int = 1, - lsp_max_nodes: int = 20, - ) -> List[SearchResult]: - """Execute hybrid search with parallel retrieval and RRF fusion. 
- - Args: - index_path: Path to _index.db file - query: FTS5 query string (for FTS) or natural language query (for vector) - limit: Maximum results to return after fusion - enable_fuzzy: Enable fuzzy FTS search (default True) - enable_vector: Enable vector search (default False) - pure_vector: If True, only use vector search without FTS fallback (default False) - enable_splade: If True, force SPLADE sparse neural search (default False) - enable_lsp_graph: If True, enable real-time LSP graph expansion (default False) - lsp_max_depth: Maximum depth for LSP graph BFS expansion (default 1) - lsp_max_nodes: Maximum nodes to collect in LSP graph (default 20) - - Returns: - List of SearchResult objects sorted by fusion score - - Examples: - >>> engine = HybridSearchEngine() - >>> # Hybrid search (exact + fuzzy + vector) - >>> results = engine.search(Path("project/_index.db"), "authentication", - ... enable_vector=True) - >>> # Pure vector search (semantic only) - >>> results = engine.search(Path("project/_index.db"), - ... "how to authenticate users", - ... enable_vector=True, pure_vector=True) - >>> # SPLADE sparse neural search - >>> results = engine.search(Path("project/_index.db"), "auth flow", - ... enable_splade=True, enable_vector=True) - >>> # With LSP graph expansion (real-time) - >>> results = engine.search(Path("project/_index.db"), "auth flow", - ... enable_vector=True, enable_lsp_graph=True) - >>> for r in results[:5]: - ... print(f"{r.path}: {r.score:.3f}") - """ - # Defensive: avoid creating/locking an index database when callers pass - # an empty placeholder file (common in tests and misconfigured callers). 
- try: - if index_path.exists() and index_path.stat().st_size == 0: - return [] - except OSError: - return [] - - # Detect query intent early for category filtering at index level - query_intent = detect_query_intent(query) - # Map intent to category for vector search: - # - KEYWORD (code intent) -> filter to 'code' only - # - SEMANTIC (doc intent) -> no filter (allow docs to surface) - # - MIXED -> no filter (allow all) - vector_category: Optional[str] = None - if query_intent == QueryIntent.KEYWORD: - vector_category = "code" - - # Determine which backends to use - backends = {} - - # Check if SPLADE is available - splade_available = False - # Respect config.enable_splade flag and use_fts_fallback flag - if self._config and getattr(self._config, 'use_fts_fallback', False): - # Config explicitly requests FTS fallback - disable SPLADE - splade_available = False - elif self._config and not getattr(self._config, 'enable_splade', True): - # Config explicitly disabled SPLADE - splade_available = False - else: - # Check if SPLADE dependencies are available - try: - from codexlens.semantic.splade_encoder import check_splade_available - ok, _ = check_splade_available() - if ok: - # SPLADE tables are in main index database, will check table existence in _search_splade - splade_available = True - except Exception: - pass - - if pure_vector: - # Pure vector mode: only use vector search, no FTS fallback - if enable_vector: - backends["vector"] = True - else: - # Invalid configuration: pure_vector=True but enable_vector=False - self.logger.warning( - "pure_vector=True requires enable_vector=True. " - "Falling back to exact search. " - "To use pure vector search, enable vector search mode." 
- ) - backends["exact"] = True - elif enable_splade: - # Explicit SPLADE mode requested via CLI --method splade - if splade_available: - backends["splade"] = True - if enable_vector: - backends["vector"] = True - else: - # SPLADE requested but not available - warn and fallback - self.logger.warning( - "SPLADE search requested but not available. " - "Falling back to FTS. Run 'codexlens index splade' to enable." - ) - backends["exact"] = True - if enable_fuzzy: - backends["fuzzy"] = True - if enable_vector: - backends["vector"] = True - else: - # Hybrid mode: default to SPLADE if available, otherwise use FTS - if splade_available: - # Default: enable SPLADE, disable exact and fuzzy - backends["splade"] = True - if enable_vector: - backends["vector"] = True - else: - # Fallback mode: enable exact+fuzzy when SPLADE unavailable - backends["exact"] = True - if enable_fuzzy: - backends["fuzzy"] = True - if enable_vector: - backends["vector"] = True - - # Add LSP graph expansion if requested and available - if enable_lsp_graph and HAS_LSP: - backends["lsp_graph"] = True - elif enable_lsp_graph and not HAS_LSP: - self.logger.warning( - "LSP graph search requested but dependencies not available. " - "Install: pip install aiohttp" - ) - - # Execute parallel searches - with timer("parallel_search_total", self.logger): - results_map = self._search_parallel( - index_path, query, backends, limit, vector_category, - lsp_max_depth, lsp_max_nodes - ) - - # Provide helpful message if pure-vector mode returns no results - if pure_vector and enable_vector and len(results_map.get("vector", [])) == 0: - self.logger.warning( - "Pure vector search returned no results. " - "This usually means embeddings haven't been generated. 
" - "Run: codexlens embeddings-generate %s", - index_path.parent if index_path.name == "_index.db" else index_path - ) - - # Apply RRF fusion - # Filter weights to only active backends - active_weights = { - source: weight - for source, weight in self.weights.items() - if source in results_map - } - - # Determine fusion method from config (default: rrf) - fusion_method = "rrf" - rrf_k = 60 - if self._config is not None: - fusion_method = getattr(self._config, "fusion_method", "rrf") or "rrf" - rrf_k = getattr(self._config, "rrf_k", 60) or 60 - - with timer("fusion", self.logger): - adaptive_weights = get_rrf_weights(query, active_weights) - if fusion_method == "simple": - fused_results = simple_weighted_fusion(results_map, adaptive_weights) - else: - # Default to RRF - fused_results = reciprocal_rank_fusion( - results_map, adaptive_weights, k=rrf_k - ) - - # Optional: boost results that include explicit symbol matches - boost_factor = ( - self._config.symbol_boost_factor - if self._config is not None - else 1.5 - ) - with timer("symbol_boost", self.logger): - fused_results = apply_symbol_boost( - fused_results, boost_factor=boost_factor - ) - - # Optional: embedding-based reranking on top results - if self._config is not None and self._config.enable_reranking: - with timer("reranking", self.logger): - if self.embedder is None: - self.embedder = self._get_reranking_embedder() - fused_results = rerank_results( - query, - fused_results[:100], - self.embedder, - top_k=( - 100 - if self._config.enable_cross_encoder_rerank - else self._config.reranking_top_k - ), - ) - - # Optional: cross-encoder reranking as a second stage - if ( - self._config is not None - and self._config.enable_reranking - and self._config.enable_cross_encoder_rerank - ): - with timer("cross_encoder_rerank", self.logger): - if self.reranker is None: - self.reranker = self._get_cross_encoder_reranker() - if self.reranker is not None: - fused_results = cross_encoder_rerank( - query, - fused_results, - 
self.reranker, - top_k=self._config.reranker_top_k, - ) - - # Apply category filtering to avoid code/doc pollution - # This ensures KEYWORD queries return code files, SEMANTIC queries prefer docs - enable_category_filter = ( - self._config is None - or getattr(self._config, 'enable_category_filter', True) - ) - if enable_category_filter and not pure_vector: - with timer("category_filter", self.logger): - query_intent = detect_query_intent(query) - fused_results = filter_results_by_category( - fused_results, query_intent, allow_mixed=True - ) - - # Apply final limit - return fused_results[:limit] - - def _get_reranking_embedder(self) -> Any: - """Create an embedder for reranking based on Config embedding settings.""" - if self._config is None: - return None - - try: - from codexlens.semantic.factory import get_embedder - except Exception as exc: - self.logger.debug("Reranking embedder unavailable: %s", exc) - return None - - try: - if self._config.embedding_backend == "fastembed": - return get_embedder( - backend="fastembed", - profile=self._config.embedding_model, - use_gpu=self._config.embedding_use_gpu, - ) - if self._config.embedding_backend == "litellm": - return get_embedder( - backend="litellm", - model=self._config.embedding_model, - endpoints=self._config.embedding_endpoints, - strategy=self._config.embedding_strategy, - cooldown=self._config.embedding_cooldown, - ) - except Exception as exc: - self.logger.debug("Failed to initialize reranking embedder: %s", exc) - return None - - self.logger.debug( - "Unknown embedding backend for reranking: %s", - self._config.embedding_backend, - ) - return None - - def _get_cross_encoder_reranker(self) -> Any: - if self._config is None: - return None - - try: - from codexlens.semantic.reranker import ( - check_reranker_available, - get_reranker, - ) - except Exception as exc: - self.logger.debug("Reranker factory unavailable: %s", exc) - return None - - backend = (getattr(self._config, "reranker_backend", "") or 
"").strip().lower() or "onnx" - - ok, err = check_reranker_available(backend) - if not ok: - self.logger.debug( - "Reranker backend unavailable (backend=%s): %s", - backend, - err, - ) - return None - - try: - model_name = (getattr(self._config, "reranker_model", "") or "").strip() or None - - if backend != "legacy" and model_name == "cross-encoder/ms-marco-MiniLM-L-6-v2": - model_name = None - - device: str | None = None - kwargs: dict[str, Any] = {} - - if backend == "onnx": - kwargs["use_gpu"] = bool(getattr(self._config, "embedding_use_gpu", True)) - elif backend == "legacy": - if not bool(getattr(self._config, "embedding_use_gpu", True)): - device = "cpu" - elif backend == "api": - # Pass max_input_tokens for adaptive batching - max_tokens = getattr(self._config, "reranker_max_input_tokens", None) - if max_tokens: - kwargs["max_input_tokens"] = max_tokens - - return get_reranker( - backend=backend, - model_name=model_name, - device=device, - **kwargs, - ) - except Exception as exc: - self.logger.debug( - "Failed to initialize reranker (backend=%s): %s", - backend, - exc, - ) - return None - - def _search_parallel( - self, - index_path: Path, - query: str, - backends: Dict[str, bool], - limit: int, - category: Optional[str] = None, - lsp_max_depth: int = 1, - lsp_max_nodes: int = 20, - ) -> Dict[str, List[SearchResult]]: - """Execute parallel searches across enabled backends. 
- - Args: - index_path: Path to _index.db file - query: FTS5 query string - backends: Dictionary of backend name to enabled flag - limit: Results limit per backend - category: Optional category filter for vector search ('code' or 'doc') - lsp_max_depth: Maximum depth for LSP graph BFS expansion (default 1) - lsp_max_nodes: Maximum nodes to collect in LSP graph (default 20) - - Returns: - Dictionary mapping source name to results list - """ - results_map: Dict[str, List[SearchResult]] = {} - timing_data: Dict[str, float] = {} - - # Use ThreadPoolExecutor for parallel I/O-bound searches - with ThreadPoolExecutor(max_workers=len(backends)) as executor: - # Submit search tasks with timing - future_to_source = {} - submit_times = {} - - if backends.get("exact"): - submit_times["exact"] = time.perf_counter() - future = executor.submit( - self._search_exact, index_path, query, limit - ) - future_to_source[future] = "exact" - - if backends.get("fuzzy"): - submit_times["fuzzy"] = time.perf_counter() - future = executor.submit( - self._search_fuzzy, index_path, query, limit - ) - future_to_source[future] = "fuzzy" - - if backends.get("vector"): - submit_times["vector"] = time.perf_counter() - future = executor.submit( - self._search_vector, index_path, query, limit, category - ) - future_to_source[future] = "vector" - - if backends.get("splade"): - submit_times["splade"] = time.perf_counter() - future = executor.submit( - self._search_splade, index_path, query, limit - ) - future_to_source[future] = "splade" - - if backends.get("lsp_graph"): - submit_times["lsp_graph"] = time.perf_counter() - future = executor.submit( - self._search_lsp_graph, index_path, query, limit, - lsp_max_depth, lsp_max_nodes - ) - future_to_source[future] = "lsp_graph" - - # Collect results as they complete with timeout protection - try: - for future in as_completed(future_to_source, timeout=30.0): - source = future_to_source[future] - elapsed_ms = (time.perf_counter() - submit_times[source]) * 1000 
- timing_data[source] = elapsed_ms - try: - results = future.result(timeout=10.0) - # Tag results with source for debugging - tagged_results = tag_search_source(results, source) - results_map[source] = tagged_results - self.logger.debug( - "[TIMING] %s_search: %.2fms (%d results)", - source, elapsed_ms, len(results) - ) - except (Exception, FuturesTimeoutError) as exc: - self.logger.error("Search failed for %s: %s", source, exc) - results_map[source] = [] - except FuturesTimeoutError: - self.logger.warning("Search timeout: some backends did not respond in time") - # Cancel remaining futures - for future in future_to_source: - future.cancel() - # Set empty results for sources that didn't complete - for source in backends: - if source not in results_map: - results_map[source] = [] - - # Log timing summary - if timing_data: - timing_str = ", ".join(f"{k}={v:.1f}ms" for k, v in timing_data.items()) - self.logger.debug("[TIMING] search_backends: {%s}", timing_str) - - return results_map - - def _search_exact( - self, index_path: Path, query: str, limit: int - ) -> List[SearchResult]: - """Execute exact FTS search using unicode61 tokenizer. - - Args: - index_path: Path to _index.db file - query: FTS5 query string - limit: Maximum results - - Returns: - List of SearchResult objects - """ - try: - with DirIndexStore(index_path) as store: - return store.search_fts_exact( - query, limit=limit, return_full_content=True - ) - except Exception as exc: - self.logger.debug("Exact search error: %s", exc) - return [] - - def _search_fuzzy( - self, index_path: Path, query: str, limit: int - ) -> List[SearchResult]: - """Execute fuzzy FTS search using trigram/extended unicode61 tokenizer. 
- - Args: - index_path: Path to _index.db file - query: FTS5 query string - limit: Maximum results - - Returns: - List of SearchResult objects - """ - try: - with DirIndexStore(index_path) as store: - return store.search_fts_fuzzy( - query, limit=limit, return_full_content=True - ) - except Exception as exc: - self.logger.debug("Fuzzy search error: %s", exc) - return [] - - def _find_vectors_hnsw(self, index_path: Path) -> Optional[Path]: - """Find the centralized _vectors.hnsw file by traversing up from index_path. - - Similar to _search_splade's approach, this method searches for the - centralized dense vector index file in parent directories. - - Args: - index_path: Path to the current _index.db file - - Returns: - Path to _vectors.hnsw if found, None otherwise - """ - current_dir = index_path.parent - for _ in range(10): # Limit search depth - candidate = current_dir / VECTORS_HNSW_NAME - if candidate.exists(): - return candidate - parent = current_dir.parent - if parent == current_dir: # Reached root - break - current_dir = parent - return None - - def _search_vector_centralized( - self, - index_path: Path, - hnsw_path: Path, - query: str, - limit: int, - category: Optional[str] = None, - ) -> List[SearchResult]: - """Search using centralized vector index. 
- - Args: - index_path: Path to _index.db file (for metadata lookup) - hnsw_path: Path to centralized _vectors.hnsw file - query: Natural language query string - limit: Maximum results - category: Optional category filter ('code' or 'doc') - - Returns: - List of SearchResult objects ordered by semantic similarity - """ - try: - import sqlite3 - import json - from codexlens.semantic.factory import get_embedder - from codexlens.semantic.ann_index import ANNIndex - - # Get model config from the first index database we can find - # (all indexes should use the same embedding model) - index_root = hnsw_path.parent - model_config = None - - # Try to get model config from the centralized index root first - # (not the sub-directory index_path, which may have outdated config) - try: - from codexlens.semantic.vector_store import VectorStore - central_index_path = index_root / "_index.db" - if central_index_path.exists(): - with VectorStore(central_index_path) as vs: - model_config = vs.get_model_config() - self.logger.debug( - "Loaded model config from centralized index: %s", - model_config - ) - except Exception as e: - self.logger.debug("Failed to load model config from centralized index: %s", e) - - # Detect dimension from HNSW file if model config not found - if model_config is None: - self.logger.debug("Model config not found, will detect from HNSW index") - # Create a temporary ANNIndex to load and detect dimension - # We need to know the dimension to properly load the index - - # Get embedder based on model config or default - if model_config: - backend = model_config.get("backend", "fastembed") - model_name = model_config["model_name"] - model_profile = model_config["model_profile"] - embedding_dim = model_config["embedding_dim"] - - if backend == "litellm": - embedder = get_embedder(backend="litellm", model=model_name) - else: - embedder = get_embedder(backend="fastembed", profile=model_profile) - else: - # Default to code profile - embedder = 
get_embedder(backend="fastembed", profile="code") - embedding_dim = embedder.embedding_dim - - # Load centralized ANN index - start_load = time.perf_counter() - ann_index = ANNIndex.create_central( - index_root=index_root, - dim=embedding_dim, - ) - if not ann_index.load(): - self.logger.warning("Failed to load centralized vector index from %s", hnsw_path) - return [] - self.logger.debug( - "[TIMING] central_ann_load: %.2fms (%d vectors)", - (time.perf_counter() - start_load) * 1000, - ann_index.count() - ) - - # Generate query embedding - start_embed = time.perf_counter() - query_embedding = embedder.embed_single(query) - self.logger.debug( - "[TIMING] query_embedding: %.2fms", - (time.perf_counter() - start_embed) * 1000 - ) - - # Search ANN index - start_search = time.perf_counter() - import numpy as np - query_vec = np.array(query_embedding, dtype=np.float32) - ids, distances = ann_index.search(query_vec, top_k=limit * 2) # Fetch extra for filtering - self.logger.debug( - "[TIMING] central_ann_search: %.2fms (%d results)", - (time.perf_counter() - start_search) * 1000, - len(ids) if ids else 0 - ) - - if not ids: - return [] - - # Convert distances to similarity scores (for cosine: score = 1 - distance) - scores = [1.0 - d for d in distances] - - # Fetch chunk metadata from semantic_chunks tables - # We need to search across all _index.db files in the project - results = self._fetch_chunks_by_ids_centralized( - index_root, ids, scores, category - ) - - return results[:limit] - - except ImportError as exc: - self.logger.debug("Semantic dependencies not available: %s", exc) - return [] - except Exception as exc: - self.logger.error("Centralized vector search error: %s", exc) - return [] - - def _fetch_chunks_by_ids_centralized( - self, - index_root: Path, - chunk_ids: List[int], - scores: List[float], - category: Optional[str] = None, - ) -> List[SearchResult]: - """Fetch chunk metadata from centralized _vectors_meta.db for fast lookup. 
- - This method uses the centralized VectorMetadataStore for O(1) lookup - instead of traversing all _index.db files (O(n) where n = number of indexes). - - Falls back to the legacy per-index lookup if centralized metadata is unavailable. - - Args: - index_root: Root directory containing _vectors_meta.db - chunk_ids: List of chunk IDs from ANN search - scores: Corresponding similarity scores - category: Optional category filter - - Returns: - List of SearchResult objects - """ - from codexlens.config import VECTORS_META_DB_NAME - - # Build score map - score_map = {cid: score for cid, score in zip(chunk_ids, scores)} - - # Try centralized metadata store first (fast path) - vectors_meta_path = index_root / VECTORS_META_DB_NAME - if vectors_meta_path.exists(): - try: - return self._fetch_from_vector_meta_store( - vectors_meta_path, chunk_ids, score_map, category - ) - except Exception as e: - self.logger.warning( - "Centralized metadata lookup failed, falling back to legacy traversal: %s. " - "Consider regenerating embeddings with: codexlens embeddings-generate --centralized", - e - ) - - # Fallback: traverse _index.db files (legacy path) - return self._fetch_chunks_by_ids_legacy( - index_root, chunk_ids, score_map, category - ) - - def _fetch_from_vector_meta_store( - self, - meta_db_path: Path, - chunk_ids: List[int], - score_map: Dict[int, float], - category: Optional[str] = None, - ) -> List[SearchResult]: - """Fetch chunks from centralized VectorMetadataStore. 
- - Args: - meta_db_path: Path to _vectors_meta.db - chunk_ids: List of chunk IDs to fetch - score_map: Mapping of chunk_id to score - category: Optional category filter - - Returns: - List of SearchResult objects - """ - from codexlens.storage.vector_meta_store import VectorMetadataStore - - results = [] - - with VectorMetadataStore(meta_db_path) as meta_store: - rows = meta_store.get_chunks_by_ids(chunk_ids, category=category) - - for row in rows: - chunk_id = row["chunk_id"] - file_path = row["file_path"] - content = row["content"] or "" - metadata = row.get("metadata") or {} - start_line = row.get("start_line") - end_line = row.get("end_line") - - score = score_map.get(chunk_id, 0.0) - - # Build excerpt - excerpt = content[:200] + "..." if len(content) > 200 else content - - # Extract symbol information - symbol_name = metadata.get("symbol_name") - symbol_kind = metadata.get("symbol_kind") - - # Build Symbol object if available - symbol = None - if symbol_name and symbol_kind and start_line and end_line: - try: - from codexlens.entities import Symbol - symbol = Symbol( - name=symbol_name, - kind=symbol_kind, - range=(start_line, end_line) - ) - except Exception: - pass - - results.append(SearchResult( - path=file_path, - score=score, - excerpt=excerpt, - content=content, - symbol=symbol, - metadata=metadata, - start_line=start_line, - end_line=end_line, - symbol_name=symbol_name, - symbol_kind=symbol_kind, - )) - - # Sort by score descending - results.sort(key=lambda r: r.score, reverse=True) - return results - - def _fetch_chunks_by_ids_legacy( - self, - index_root: Path, - chunk_ids: List[int], - score_map: Dict[int, float], - category: Optional[str] = None, - ) -> List[SearchResult]: - """Legacy fallback: fetch chunk metadata by traversing all _index.db files. - - This is the O(n) fallback path used when centralized metadata is unavailable. 
- - Args: - index_root: Root directory containing _index.db files - chunk_ids: List of chunk IDs from ANN search - score_map: Mapping of chunk_id to score - category: Optional category filter - - Returns: - List of SearchResult objects - """ - import sqlite3 - import json - - # Find all _index.db files - index_files = list(index_root.rglob("_index.db")) - - results = [] - found_ids = set() - - for index_path in index_files: - try: - with sqlite3.connect(index_path) as conn: - conn.row_factory = sqlite3.Row - - # Check if semantic_chunks table exists - cursor = conn.execute( - "SELECT name FROM sqlite_master WHERE type='table' AND name='semantic_chunks'" - ) - if cursor.fetchone() is None: - continue - - # Build query for chunk IDs we haven't found yet - remaining_ids = [cid for cid in chunk_ids if cid not in found_ids] - if not remaining_ids: - break - - placeholders = ",".join("?" * len(remaining_ids)) - - if category: - query = f""" - SELECT id, file_path, content, metadata - FROM semantic_chunks - WHERE id IN ({placeholders}) AND category = ? - """ - params = remaining_ids + [category] - else: - query = f""" - SELECT id, file_path, content, metadata - FROM semantic_chunks - WHERE id IN ({placeholders}) - """ - params = remaining_ids - - rows = conn.execute(query, params).fetchall() - - for row in rows: - chunk_id = row["id"] - if chunk_id in found_ids: - continue - found_ids.add(chunk_id) - - file_path = row["file_path"] - content = row["content"] - metadata_json = row["metadata"] - metadata = json.loads(metadata_json) if metadata_json else {} - - score = score_map.get(chunk_id, 0.0) - - # Build excerpt - excerpt = content[:200] + "..." 
if len(content) > 200 else content - - # Extract symbol information - symbol_name = metadata.get("symbol_name") - symbol_kind = metadata.get("symbol_kind") - start_line = metadata.get("start_line") - end_line = metadata.get("end_line") - - # Build Symbol object if available - symbol = None - if symbol_name and symbol_kind and start_line and end_line: - try: - from codexlens.entities import Symbol - symbol = Symbol( - name=symbol_name, - kind=symbol_kind, - range=(start_line, end_line) - ) - except Exception: - pass - - results.append(SearchResult( - path=file_path, - score=score, - excerpt=excerpt, - content=content, - symbol=symbol, - metadata=metadata, - start_line=start_line, - end_line=end_line, - symbol_name=symbol_name, - symbol_kind=symbol_kind, - )) - - except Exception as e: - self.logger.debug("Failed to fetch chunks from %s: %s", index_path, e) - continue - - # Sort by score descending - results.sort(key=lambda r: r.score, reverse=True) - return results - - def _search_vector( - self, index_path: Path, query: str, limit: int, category: Optional[str] = None - ) -> List[SearchResult]: - """Execute vector similarity search using semantic embeddings. - - Supports both centralized vector storage (single _vectors.hnsw at project root) - and distributed storage (per-directory .hnsw files). 
- - Args: - index_path: Path to _index.db file - query: Natural language query string - limit: Maximum results - category: Optional category filter ('code' or 'doc') - - Returns: - List of SearchResult objects ordered by semantic similarity - """ - try: - # First, check for centralized vector index - central_hnsw_path = self._find_vectors_hnsw(index_path) - if central_hnsw_path is not None: - self.logger.debug("Found centralized vector index at %s", central_hnsw_path) - return self._search_vector_centralized( - index_path, central_hnsw_path, query, limit, category - ) - - # Fallback to distributed (per-index) vector storage - # Check if semantic chunks table exists - import sqlite3 - - start_check = time.perf_counter() - try: - with sqlite3.connect(index_path) as conn: - cursor = conn.execute( - "SELECT name FROM sqlite_master WHERE type='table' AND name='semantic_chunks'" - ) - has_semantic_table = cursor.fetchone() is not None - except sqlite3.Error as e: - self.logger.error("Database check failed in vector search: %s", e) - return [] - self.logger.debug( - "[TIMING] vector_table_check: %.2fms", - (time.perf_counter() - start_check) * 1000 - ) - - if not has_semantic_table: - self.logger.info( - "No embeddings found in index. " - "Generate embeddings with: codexlens embeddings-generate %s", - index_path.parent if index_path.name == "_index.db" else index_path - ) - return [] - - # Initialize embedder and vector store - from codexlens.semantic.factory import get_embedder - from codexlens.semantic.vector_store import VectorStore - - start_init = time.perf_counter() - vector_store = VectorStore(index_path) - self.logger.debug( - "[TIMING] vector_store_init: %.2fms", - (time.perf_counter() - start_init) * 1000 - ) - - # Check if vector store has data - if vector_store.count_chunks() == 0: - self.logger.info( - "Vector store is empty (0 chunks). 
" - "Generate embeddings with: codexlens embeddings-generate %s", - index_path.parent if index_path.name == "_index.db" else index_path - ) - return [] - - # Get stored model configuration (preferred) or auto-detect from dimension - start_embedder = time.perf_counter() - model_config = vector_store.get_model_config() - if model_config: - backend = model_config.get("backend", "fastembed") - model_name = model_config["model_name"] - model_profile = model_config["model_profile"] - self.logger.debug( - "Using stored model config: %s backend, %s (%s, %dd)", - backend, model_profile, model_name, model_config["embedding_dim"] - ) - - # Get embedder based on backend - if backend == "litellm": - embedder = get_embedder(backend="litellm", model=model_name) - else: - embedder = get_embedder(backend="fastembed", profile=model_profile) - else: - # Fallback: auto-detect from embedding dimension - detected_dim = vector_store.dimension - if detected_dim is None: - self.logger.info("Vector store dimension unknown, using default profile") - embedder = get_embedder(backend="fastembed", profile="code") - elif detected_dim == 384: - embedder = get_embedder(backend="fastembed", profile="fast") - elif detected_dim == 768: - embedder = get_embedder(backend="fastembed", profile="code") - elif detected_dim == 1024: - embedder = get_embedder(backend="fastembed", profile="multilingual") - elif detected_dim == 1536: - # Likely OpenAI text-embedding-3-small or ada-002 - self.logger.info( - "Detected 1536-dim embeddings (likely OpenAI), using litellm backend with text-embedding-3-small" - ) - embedder = get_embedder(backend="litellm", model="text-embedding-3-small") - elif detected_dim == 3072: - # Likely OpenAI text-embedding-3-large - self.logger.info( - "Detected 3072-dim embeddings (likely OpenAI), using litellm backend with text-embedding-3-large" - ) - embedder = get_embedder(backend="litellm", model="text-embedding-3-large") - else: - self.logger.debug( - "Unknown dimension %s, using 
default fastembed profile 'code'", - detected_dim - ) - embedder = get_embedder(backend="fastembed", profile="code") - self.logger.debug( - "[TIMING] embedder_init: %.2fms", - (time.perf_counter() - start_embedder) * 1000 - ) - - # Generate query embedding - start_embed = time.perf_counter() - query_embedding = embedder.embed_single(query) - self.logger.debug( - "[TIMING] query_embedding: %.2fms", - (time.perf_counter() - start_embed) * 1000 - ) - - # Search for similar chunks - start_search = time.perf_counter() - results = vector_store.search_similar( - query_embedding=query_embedding, - top_k=limit, - min_score=0.0, # Return all results, let RRF handle filtering - return_full_content=True, - category=category, - ) - self.logger.debug( - "[TIMING] vector_similarity_search: %.2fms (%d results)", - (time.perf_counter() - start_search) * 1000, len(results) - ) - - return results - - except ImportError as exc: - self.logger.debug("Semantic dependencies not available: %s", exc) - return [] - except Exception as exc: - self.logger.error("Vector search error: %s", exc) - return [] - - def _search_splade( - self, index_path: Path, query: str, limit: int - ) -> List[SearchResult]: - """SPLADE sparse retrieval via inverted index. 
- - Args: - index_path: Path to _index.db file - query: Natural language query string - limit: Maximum results - - Returns: - List of SearchResult ordered by SPLADE score - """ - try: - from codexlens.semantic.splade_encoder import get_splade_encoder, check_splade_available - from codexlens.storage.splade_index import SpladeIndex - from codexlens.config import SPLADE_DB_NAME - import sqlite3 - import json - - # Check dependencies - ok, err = check_splade_available() - if not ok: - self.logger.debug("SPLADE not available: %s", err) - return [] - - # SPLADE index is stored in _splade.db at the project index root - # Traverse up from the current index to find the root _splade.db - current_dir = index_path.parent - splade_db_path = None - for _ in range(10): # Limit search depth - candidate = current_dir / SPLADE_DB_NAME - if candidate.exists(): - splade_db_path = candidate - break - parent = current_dir.parent - if parent == current_dir: # Reached root - break - current_dir = parent - - if not splade_db_path: - self.logger.debug("SPLADE index not found in ancestor directories of %s", index_path) - return [] - - splade_index = SpladeIndex(splade_db_path) - if not splade_index.has_index(): - self.logger.debug("SPLADE index not initialized") - return [] - - # Encode query to sparse vector - encoder = get_splade_encoder(use_gpu=self._use_gpu) - query_sparse = encoder.encode_text(query) - - # Search inverted index for top matches - raw_results = splade_index.search(query_sparse, limit=limit, min_score=0.0) - - if not raw_results: - return [] - - # Fetch chunk details from splade_chunks table (self-contained) - chunk_ids = [chunk_id for chunk_id, _ in raw_results] - score_map = {chunk_id: score for chunk_id, score in raw_results} - - # Get chunk metadata from SPLADE database - rows = splade_index.get_chunks_by_ids(chunk_ids) - - # Build SearchResult objects - results = [] - for row in rows: - chunk_id = row["id"] - file_path = row["file_path"] - content = row["content"] - 
metadata_json = row["metadata"] - metadata = json.loads(metadata_json) if metadata_json else {} - - score = score_map.get(chunk_id, 0.0) - - # Build excerpt (short preview) - excerpt = content[:200] + "..." if len(content) > 200 else content - - # Extract symbol information from metadata - symbol_name = metadata.get("symbol_name") - symbol_kind = metadata.get("symbol_kind") - start_line = metadata.get("start_line") - end_line = metadata.get("end_line") - - # Build Symbol object if we have symbol info - symbol = None - if symbol_name and symbol_kind and start_line and end_line: - try: - from codexlens.entities import Symbol - symbol = Symbol( - name=symbol_name, - kind=symbol_kind, - range=(start_line, end_line) - ) - except Exception: - pass - - results.append(SearchResult( - path=file_path, - score=score, - excerpt=excerpt, - content=content, - symbol=symbol, - metadata=metadata, - start_line=start_line, - end_line=end_line, - symbol_name=symbol_name, - symbol_kind=symbol_kind, - )) - - return results - - except Exception as exc: - self.logger.debug("SPLADE search error: %s", exc) - return [] - - def _search_lsp_graph( - self, - index_path: Path, - query: str, - limit: int, - max_depth: int = 1, - max_nodes: int = 20, - ) -> List[SearchResult]: - """Execute LSP-based graph expansion search. - - Uses real-time LSP to expand from seed results and find related code. - This provides accurate, up-to-date code relationships. - - Args: - index_path: Path to _index.db file - query: Natural language query string - limit: Maximum results - max_depth: Maximum depth for LSP graph BFS expansion (default 1) - max_nodes: Maximum nodes to collect in LSP graph (default 20) - - Returns: - List of SearchResult from graph expansion - """ - import asyncio - - if not HAS_LSP: - self.logger.debug("LSP dependencies not available") - return [] - - try: - # Try multiple seed sources in priority order - seeds = [] - seed_source = "none" - - # 1. 
Try vector search first (best semantic match) - seeds = self._search_vector(index_path, query, limit=3, category="code") - if seeds: - seed_source = "vector" - - # 2. Fallback to SPLADE if vector returns nothing - if not seeds: - self.logger.debug("Vector search returned no seeds, trying SPLADE") - seeds = self._search_splade(index_path, query, limit=3) - if seeds: - seed_source = "splade" - - # 3. Fallback to exact FTS if SPLADE also fails - if not seeds: - self.logger.debug("SPLADE returned no seeds, trying exact FTS") - seeds = self._search_exact(index_path, query, limit=3) - if seeds: - seed_source = "exact_fts" - - # 4. No seeds available from any source - if not seeds: - self.logger.debug("No seed results available for LSP graph expansion") - return [] - - self.logger.debug( - "LSP graph expansion using %d seeds from %s", - len(seeds), - seed_source, - ) - - # Convert SearchResult to CodeSymbolNode for LSP processing - from codexlens.hybrid_search.data_structures import CodeSymbolNode, Range - - seed_nodes = [] - for seed in seeds: - try: - node = CodeSymbolNode( - id=f"{seed.path}:{seed.symbol_name or 'unknown'}:{seed.start_line or 0}", - name=seed.symbol_name or "unknown", - kind=seed.symbol_kind or "unknown", - file_path=seed.path, - range=Range( - start_line=seed.start_line or 1, - start_character=0, - end_line=seed.end_line or seed.start_line or 1, - end_character=0, - ), - raw_code=seed.content or "", - docstring=seed.excerpt or "", - ) - seed_nodes.append(node) - except Exception as e: - self.logger.debug("Failed to create seed node: %s", e) - continue - - if not seed_nodes: - return [] - - # Run async LSP expansion in sync context - async def expand_graph(): - async with LspBridge() as bridge: - builder = LspGraphBuilder(max_depth=max_depth, max_nodes=max_nodes) - graph = await builder.build_from_seeds(seed_nodes, bridge) - return graph - - # Run the async code - try: - loop = asyncio.get_event_loop() - if loop.is_running(): - # Already in async 
context - use run_coroutine_threadsafe - import concurrent.futures - future = asyncio.run_coroutine_threadsafe(expand_graph(), loop) - graph = future.result(timeout=5.0) - else: - graph = loop.run_until_complete(expand_graph()) - except RuntimeError: - # No event loop - create new one - graph = asyncio.run(expand_graph()) - - # Convert graph nodes to SearchResult - # Create set of seed identifiers for fast lookup - seed_ids = set() - for seed in seeds: - seed_id = f"{seed.path}:{seed.symbol_name or 'unknown'}:{seed.start_line or 0}" - seed_ids.add(seed_id) - - results = [] - for node_id, node in graph.nodes.items(): - # Skip seed nodes using ID comparison (already in other results) - if node_id in seed_ids or node.id in seed_ids: - continue - - # Calculate score based on graph position - # Nodes closer to seeds get higher scores - depth = 1 # Simple heuristic, could be improved - score = 0.8 / (1 + depth) # Score decreases with depth - - results.append(SearchResult( - path=node.file_path, - score=score, - excerpt=node.docstring[:200] if node.docstring else node.raw_code[:200] if node.raw_code else "", - content=node.raw_code, - symbol=None, - metadata={"lsp_node_id": node_id, "lsp_kind": node.kind}, - start_line=node.range.start_line, - end_line=node.range.end_line, - symbol_name=node.name, - symbol_kind=node.kind, - )) - - # Sort by score - results.sort(key=lambda r: r.score, reverse=True) - return results[:limit] - - except Exception as exc: - self.logger.debug("LSP graph search error: %s", exc) - return [] diff --git a/codex-lens/build/lib/codexlens/search/query_parser.py b/codex-lens/build/lib/codexlens/search/query_parser.py deleted file mode 100644 index 05b337f5..00000000 --- a/codex-lens/build/lib/codexlens/search/query_parser.py +++ /dev/null @@ -1,242 +0,0 @@ -"""Query preprocessing for CodexLens search. 
- -Provides query expansion for better identifier matching: -- CamelCase splitting: UserAuth → User OR Auth -- snake_case splitting: user_auth → user OR auth -- Preserves original query for exact matching -""" - -from __future__ import annotations - -import logging -import re -from typing import Set, List - -log = logging.getLogger(__name__) - - -class QueryParser: - """Parser for preprocessing search queries before FTS5 execution. - - Expands identifier-style queries (CamelCase, snake_case) into OR queries - to improve recall when searching for code symbols. - - Example transformations: - - 'UserAuth' → 'UserAuth OR User OR Auth' - - 'user_auth' → 'user_auth OR user OR auth' - - 'getUserData' → 'getUserData OR get OR User OR Data' - """ - - # Patterns for identifier splitting - CAMEL_CASE_PATTERN = re.compile(r'([a-z])([A-Z])') - SNAKE_CASE_PATTERN = re.compile(r'_+') - KEBAB_CASE_PATTERN = re.compile(r'-+') - - # Minimum token length to include in expansion (avoid noise from single chars) - MIN_TOKEN_LENGTH = 2 - - # All-caps acronyms pattern (e.g., HTTP, SQL, API) - ALL_CAPS_PATTERN = re.compile(r'^[A-Z]{2,}$') - - def __init__(self, enable: bool = True, min_token_length: int = 2): - """Initialize query parser. - - Args: - enable: Whether to enable query preprocessing - min_token_length: Minimum token length to include in expansion - """ - self.enable = enable - self.min_token_length = min_token_length - - def preprocess_query(self, query: str) -> str: - """Preprocess query with identifier expansion. 
- - Args: - query: Original search query - - Returns: - Expanded query with OR operator connecting original and split tokens - - Example: - >>> parser = QueryParser() - >>> parser.preprocess_query('UserAuth') - 'UserAuth OR User OR Auth' - >>> parser.preprocess_query('get_user_data') - 'get_user_data OR get OR user OR data' - """ - if not self.enable: - return query - - query = query.strip() - if not query: - return query - - # Extract tokens from query (handle multiple words/terms) - # For simple queries, just process the whole thing - # For complex FTS5 queries with operators, preserve structure - if self._is_simple_query(query): - return self._expand_simple_query(query) - else: - # Complex query with FTS5 operators, don't expand - log.debug(f"Skipping expansion for complex FTS5 query: {query}") - return query - - def _is_simple_query(self, query: str) -> bool: - """Check if query is simple (no FTS5 operators). - - Args: - query: Search query - - Returns: - True if query is simple (safe to expand), False otherwise - """ - # Check for FTS5 operators that indicate complex query - fts5_operators = ['OR', 'AND', 'NOT', 'NEAR', '*', '^', '"'] - return not any(op in query for op in fts5_operators) - - def _expand_simple_query(self, query: str) -> str: - """Expand a simple query with identifier splitting. 
- - Args: - query: Simple search query - - Returns: - Expanded query with OR operators - """ - tokens: Set[str] = set() - - # Always include original query - tokens.add(query) - - # Split on whitespace first - words = query.split() - - for word in words: - # Extract tokens from this word - word_tokens = self._extract_tokens(word) - tokens.update(word_tokens) - - # Filter out short tokens and duplicates - filtered_tokens = [ - t for t in tokens - if len(t) >= self.min_token_length - ] - - # Remove duplicates while preserving original query first - unique_tokens: List[str] = [] - seen: Set[str] = set() - - # Always put original query first - if query not in seen and len(query) >= self.min_token_length: - unique_tokens.append(query) - seen.add(query) - - # Add other tokens - for token in filtered_tokens: - if token not in seen: - unique_tokens.append(token) - seen.add(token) - - # Join with OR operator (only if we have multiple tokens) - if len(unique_tokens) > 1: - expanded = ' OR '.join(unique_tokens) - log.debug(f"Expanded query: '{query}' → '{expanded}'") - return expanded - else: - return query - - def _extract_tokens(self, word: str) -> Set[str]: - """Extract tokens from a single word using various splitting strategies. - - Args: - word: Single word/identifier to split - - Returns: - Set of extracted tokens - """ - tokens: Set[str] = set() - - # Add original word - tokens.add(word) - - # Handle all-caps acronyms (don't split) - if self.ALL_CAPS_PATTERN.match(word): - return tokens - - # CamelCase splitting - camel_tokens = self._split_camel_case(word) - tokens.update(camel_tokens) - - # snake_case splitting - snake_tokens = self._split_snake_case(word) - tokens.update(snake_tokens) - - # kebab-case splitting - kebab_tokens = self._split_kebab_case(word) - tokens.update(kebab_tokens) - - return tokens - - def _split_camel_case(self, word: str) -> List[str]: - """Split CamelCase identifier into tokens. 
- - Args: - word: CamelCase identifier (e.g., 'getUserData') - - Returns: - List of tokens (e.g., ['get', 'User', 'Data']) - """ - # Insert space before uppercase letters preceded by lowercase - spaced = self.CAMEL_CASE_PATTERN.sub(r'\1 \2', word) - # Split on spaces and filter empty - return [t for t in spaced.split() if t] - - def _split_snake_case(self, word: str) -> List[str]: - """Split snake_case identifier into tokens. - - Args: - word: snake_case identifier (e.g., 'get_user_data') - - Returns: - List of tokens (e.g., ['get', 'user', 'data']) - """ - # Split on underscores - return [t for t in self.SNAKE_CASE_PATTERN.split(word) if t] - - def _split_kebab_case(self, word: str) -> List[str]: - """Split kebab-case identifier into tokens. - - Args: - word: kebab-case identifier (e.g., 'get-user-data') - - Returns: - List of tokens (e.g., ['get', 'user', 'data']) - """ - # Split on hyphens - return [t for t in self.KEBAB_CASE_PATTERN.split(word) if t] - - -# Global default parser instance -_default_parser = QueryParser(enable=True) - - -def preprocess_query(query: str, enable: bool = True) -> str: - """Convenience function for query preprocessing. - - Args: - query: Original search query - enable: Whether to enable preprocessing - - Returns: - Preprocessed query with identifier expansion - """ - if not enable: - return query - - return _default_parser.preprocess_query(query) - - -__all__ = [ - "QueryParser", - "preprocess_query", -] diff --git a/codex-lens/build/lib/codexlens/search/ranking.py b/codex-lens/build/lib/codexlens/search/ranking.py deleted file mode 100644 index 256c78bd..00000000 --- a/codex-lens/build/lib/codexlens/search/ranking.py +++ /dev/null @@ -1,942 +0,0 @@ -"""Ranking algorithms for hybrid search result fusion. - -Implements Reciprocal Rank Fusion (RRF) and score normalization utilities -for combining results from heterogeneous search backends (SPLADE, exact FTS, fuzzy FTS, vector search). 
-""" - -from __future__ import annotations - -import re -import math -from enum import Enum -from pathlib import Path -from typing import Any, Dict, List, Optional - -from codexlens.entities import SearchResult, AdditionalLocation - - -# Default RRF weights for SPLADE-based hybrid search -DEFAULT_WEIGHTS = { - "splade": 0.35, # Replaces exact(0.3) + fuzzy(0.1) - "vector": 0.5, - "lsp_graph": 0.15, # Real-time LSP-based graph expansion -} - -# Legacy weights for FTS fallback mode (when SPLADE unavailable) -FTS_FALLBACK_WEIGHTS = { - "exact": 0.25, - "fuzzy": 0.1, - "vector": 0.5, - "lsp_graph": 0.15, # Real-time LSP-based graph expansion -} - - -class QueryIntent(str, Enum): - """Query intent for adaptive RRF weights (Python/TypeScript parity).""" - - KEYWORD = "keyword" - SEMANTIC = "semantic" - MIXED = "mixed" - - -def normalize_weights(weights: Dict[str, float | None]) -> Dict[str, float | None]: - """Normalize weights to sum to 1.0 (best-effort).""" - total = sum(float(v) for v in weights.values() if v is not None) - - # NaN total: do not attempt to normalize (division would propagate NaNs). - if math.isnan(total): - return dict(weights) - - # Infinite total: do not attempt to normalize (division yields 0 or NaN). - if not math.isfinite(total): - return dict(weights) - - # Zero/negative total: do not attempt to normalize (invalid denominator). - if total <= 0: - return dict(weights) - - return {k: (float(v) / total if v is not None else None) for k, v in weights.items()} - - -def detect_query_intent(query: str) -> QueryIntent: - """Detect whether a query is code-like, natural-language, or mixed. - - Heuristic signals kept aligned with `ccw/src/tools/smart-search.ts`. 
- """ - trimmed = (query or "").strip() - if not trimmed: - return QueryIntent.MIXED - - lower = trimmed.lower() - word_count = len([w for w in re.split(r"\s+", trimmed) if w]) - - has_code_signals = bool( - re.search(r"(::|->|\.)", trimmed) - or re.search(r"[A-Z][a-z]+[A-Z]", trimmed) - or re.search(r"\b\w+_\w+\b", trimmed) - or re.search( - r"\b(def|class|function|const|let|var|import|from|return|async|await|interface|type)\b", - lower, - flags=re.IGNORECASE, - ) - ) - has_natural_signals = bool( - word_count > 5 - or "?" in trimmed - or re.search(r"\b(how|what|why|when|where)\b", trimmed, flags=re.IGNORECASE) - or re.search( - r"\b(handle|explain|fix|implement|create|build|use|find|search|convert|parse|generate|support)\b", - trimmed, - flags=re.IGNORECASE, - ) - ) - - if has_code_signals and has_natural_signals: - return QueryIntent.MIXED - if has_code_signals: - return QueryIntent.KEYWORD - if has_natural_signals: - return QueryIntent.SEMANTIC - return QueryIntent.MIXED - - -def adjust_weights_by_intent( - intent: QueryIntent, - base_weights: Dict[str, float], -) -> Dict[str, float]: - """Adjust RRF weights based on query intent.""" - # Check if using SPLADE or FTS mode - use_splade = "splade" in base_weights - - if intent == QueryIntent.KEYWORD: - if use_splade: - target = {"splade": 0.6, "vector": 0.4} - else: - target = {"exact": 0.5, "fuzzy": 0.1, "vector": 0.4} - elif intent == QueryIntent.SEMANTIC: - if use_splade: - target = {"splade": 0.3, "vector": 0.7} - else: - target = {"exact": 0.2, "fuzzy": 0.1, "vector": 0.7} - else: - target = dict(base_weights) - - # Filter to active backends - keys = list(base_weights.keys()) - filtered = {k: float(target.get(k, 0.0)) for k in keys} - return normalize_weights(filtered) - - -def get_rrf_weights( - query: str, - base_weights: Dict[str, float], -) -> Dict[str, float]: - """Compute adaptive RRF weights from query intent.""" - return adjust_weights_by_intent(detect_query_intent(query), base_weights) - - -# File 
extensions to category mapping for fast lookup -_EXT_TO_CATEGORY: Dict[str, str] = { - # Code extensions - ".py": "code", ".js": "code", ".jsx": "code", ".ts": "code", ".tsx": "code", - ".java": "code", ".go": "code", ".zig": "code", ".m": "code", ".mm": "code", - ".c": "code", ".h": "code", ".cc": "code", ".cpp": "code", ".hpp": "code", ".cxx": "code", - ".rs": "code", - # Doc extensions - ".md": "doc", ".mdx": "doc", ".txt": "doc", ".rst": "doc", -} - - -def get_file_category(path: str) -> Optional[str]: - """Get file category ('code' or 'doc') from path extension. - - Args: - path: File path string - - Returns: - 'code', 'doc', or None if unknown - """ - ext = Path(path).suffix.lower() - return _EXT_TO_CATEGORY.get(ext) - - -def filter_results_by_category( - results: List[SearchResult], - intent: QueryIntent, - allow_mixed: bool = True, -) -> List[SearchResult]: - """Filter results by category based on query intent. - - Strategy: - - KEYWORD (code intent): Only return code files - - SEMANTIC (doc intent): Prefer docs, but allow code if allow_mixed=True - - MIXED: Return all results - - Args: - results: List of SearchResult objects - intent: Query intent from detect_query_intent() - allow_mixed: If True, SEMANTIC intent includes code files with lower priority - - Returns: - Filtered and re-ranked list of SearchResult objects - """ - if not results or intent == QueryIntent.MIXED: - return results - - code_results = [] - doc_results = [] - unknown_results = [] - - for r in results: - category = get_file_category(r.path) - if category == "code": - code_results.append(r) - elif category == "doc": - doc_results.append(r) - else: - unknown_results.append(r) - - if intent == QueryIntent.KEYWORD: - # Code intent: return only code files + unknown (might be code) - filtered = code_results + unknown_results - elif intent == QueryIntent.SEMANTIC: - if allow_mixed: - # Semantic intent with mixed: docs first, then code - filtered = doc_results + code_results + unknown_results 
- else: - # Semantic intent strict: only docs - filtered = doc_results + unknown_results - else: - filtered = results - - return filtered - - -def simple_weighted_fusion( - results_map: Dict[str, List[SearchResult]], - weights: Dict[str, float] = None, -) -> List[SearchResult]: - """Combine search results using simple weighted sum of normalized scores. - - This is an alternative to RRF that preserves score magnitude information. - Scores are min-max normalized per source before weighted combination. - - Formula: score(d) = Σ weight_source * normalized_score_source(d) - - Args: - results_map: Dictionary mapping source name to list of SearchResult objects - Sources: 'exact', 'fuzzy', 'vector', 'splade' - weights: Dictionary mapping source name to weight (default: equal weights) - Example: {'exact': 0.3, 'fuzzy': 0.1, 'vector': 0.6} - - Returns: - List of SearchResult objects sorted by fused score (descending) - - Examples: - >>> fts_results = [SearchResult(path="a.py", score=10.0, excerpt="...")] - >>> vector_results = [SearchResult(path="b.py", score=0.85, excerpt="...")] - >>> results_map = {'exact': fts_results, 'vector': vector_results} - >>> fused = simple_weighted_fusion(results_map) - """ - if not results_map: - return [] - - # Default equal weights if not provided - if weights is None: - num_sources = len(results_map) - weights = {source: 1.0 / num_sources for source in results_map} - - # Normalize weights to sum to 1.0 - weight_sum = sum(weights.values()) - if not math.isclose(weight_sum, 1.0, abs_tol=0.01) and weight_sum > 0: - weights = {source: w / weight_sum for source, w in weights.items()} - - # Compute min-max normalization parameters per source - source_stats: Dict[str, tuple] = {} - for source_name, results in results_map.items(): - if not results: - continue - scores = [r.score for r in results] - min_s, max_s = min(scores), max(scores) - source_stats[source_name] = (min_s, max_s) - - def normalize_score(score: float, source: str) -> float: - 
"""Normalize score to [0, 1] range using min-max scaling.""" - if source not in source_stats: - return 0.0 - min_s, max_s = source_stats[source] - if max_s == min_s: - return 1.0 if score >= min_s else 0.0 - return (score - min_s) / (max_s - min_s) - - # Build unified result set with weighted scores - path_to_result: Dict[str, SearchResult] = {} - path_to_fusion_score: Dict[str, float] = {} - path_to_source_scores: Dict[str, Dict[str, float]] = {} - - for source_name, results in results_map.items(): - weight = weights.get(source_name, 0.0) - if weight == 0: - continue - - for result in results: - path = result.path - normalized = normalize_score(result.score, source_name) - contribution = weight * normalized - - if path not in path_to_fusion_score: - path_to_fusion_score[path] = 0.0 - path_to_result[path] = result - path_to_source_scores[path] = {} - - path_to_fusion_score[path] += contribution - path_to_source_scores[path][source_name] = normalized - - # Create final results with fusion scores - fused_results = [] - for path, base_result in path_to_result.items(): - fusion_score = path_to_fusion_score[path] - - fused_result = SearchResult( - path=base_result.path, - score=fusion_score, - excerpt=base_result.excerpt, - content=base_result.content, - symbol=base_result.symbol, - chunk=base_result.chunk, - metadata={ - **base_result.metadata, - "fusion_method": "simple_weighted", - "fusion_score": fusion_score, - "original_score": base_result.score, - "source_scores": path_to_source_scores[path], - }, - start_line=base_result.start_line, - end_line=base_result.end_line, - symbol_name=base_result.symbol_name, - symbol_kind=base_result.symbol_kind, - ) - fused_results.append(fused_result) - - fused_results.sort(key=lambda r: r.score, reverse=True) - return fused_results - - -def reciprocal_rank_fusion( - results_map: Dict[str, List[SearchResult]], - weights: Dict[str, float] = None, - k: int = 60, -) -> List[SearchResult]: - """Combine search results from multiple 
sources using Reciprocal Rank Fusion. - - RRF formula: score(d) = Σ weight_source / (k + rank_source(d)) - - Supports three-way fusion with FTS, Vector, and SPLADE sources. - - Args: - results_map: Dictionary mapping source name to list of SearchResult objects - Sources: 'exact', 'fuzzy', 'vector', 'splade' - weights: Dictionary mapping source name to weight (default: equal weights) - Example: {'exact': 0.3, 'fuzzy': 0.1, 'vector': 0.6} - Or: {'splade': 0.4, 'vector': 0.6} - k: Constant to avoid division by zero and control rank influence (default 60) - - Returns: - List of SearchResult objects sorted by fused score (descending) - - Examples: - >>> exact_results = [SearchResult(path="a.py", score=10.0, excerpt="...")] - >>> fuzzy_results = [SearchResult(path="b.py", score=8.0, excerpt="...")] - >>> results_map = {'exact': exact_results, 'fuzzy': fuzzy_results} - >>> fused = reciprocal_rank_fusion(results_map) - - # Three-way fusion with SPLADE - >>> results_map = { - ... 'exact': exact_results, - ... 'vector': vector_results, - ... 'splade': splade_results - ... 
} - >>> fused = reciprocal_rank_fusion(results_map, k=60) - """ - if not results_map: - return [] - - # Default equal weights if not provided - if weights is None: - num_sources = len(results_map) - weights = {source: 1.0 / num_sources for source in results_map} - - # Validate weights sum to 1.0 - weight_sum = sum(weights.values()) - if not math.isclose(weight_sum, 1.0, abs_tol=0.01): - # Normalize weights to sum to 1.0 - weights = {source: w / weight_sum for source, w in weights.items()} - - # Build unified result set with RRF scores - path_to_result: Dict[str, SearchResult] = {} - path_to_fusion_score: Dict[str, float] = {} - path_to_source_ranks: Dict[str, Dict[str, int]] = {} - - for source_name, results in results_map.items(): - weight = weights.get(source_name, 0.0) - if weight == 0: - continue - - for rank, result in enumerate(results, start=1): - path = result.path - rrf_contribution = weight / (k + rank) - - # Initialize or accumulate fusion score - if path not in path_to_fusion_score: - path_to_fusion_score[path] = 0.0 - path_to_result[path] = result - path_to_source_ranks[path] = {} - - path_to_fusion_score[path] += rrf_contribution - path_to_source_ranks[path][source_name] = rank - - # Create final results with fusion scores - fused_results = [] - for path, base_result in path_to_result.items(): - fusion_score = path_to_fusion_score[path] - - # Create new SearchResult with fusion_score in metadata - fused_result = SearchResult( - path=base_result.path, - score=fusion_score, - excerpt=base_result.excerpt, - content=base_result.content, - symbol=base_result.symbol, - chunk=base_result.chunk, - metadata={ - **base_result.metadata, - "fusion_method": "rrf", - "fusion_score": fusion_score, - "original_score": base_result.score, - "rrf_k": k, - "source_ranks": path_to_source_ranks[path], - }, - start_line=base_result.start_line, - end_line=base_result.end_line, - symbol_name=base_result.symbol_name, - symbol_kind=base_result.symbol_kind, - ) - 
fused_results.append(fused_result) - - # Sort by fusion score descending - fused_results.sort(key=lambda r: r.score, reverse=True) - - return fused_results - - -def apply_symbol_boost( - results: List[SearchResult], - boost_factor: float = 1.5, -) -> List[SearchResult]: - """Boost fused scores for results that include an explicit symbol match. - - The boost is multiplicative on the current result.score (typically the RRF fusion score). - When boosted, the original score is preserved in metadata["original_fusion_score"] and - metadata["boosted"] is set to True. - """ - if not results: - return [] - - if boost_factor <= 1.0: - # Still return new objects to follow immutable transformation pattern. - return [ - SearchResult( - path=r.path, - score=r.score, - excerpt=r.excerpt, - content=r.content, - symbol=r.symbol, - chunk=r.chunk, - metadata={**r.metadata}, - start_line=r.start_line, - end_line=r.end_line, - symbol_name=r.symbol_name, - symbol_kind=r.symbol_kind, - additional_locations=list(r.additional_locations), - ) - for r in results - ] - - boosted_results: List[SearchResult] = [] - for result in results: - has_symbol = bool(result.symbol_name) - original_score = float(result.score) - boosted_score = original_score * boost_factor if has_symbol else original_score - - metadata = {**result.metadata} - if has_symbol: - metadata.setdefault("original_fusion_score", metadata.get("fusion_score", original_score)) - metadata["boosted"] = True - metadata["symbol_boost_factor"] = boost_factor - - boosted_results.append( - SearchResult( - path=result.path, - score=boosted_score, - excerpt=result.excerpt, - content=result.content, - symbol=result.symbol, - chunk=result.chunk, - metadata=metadata, - start_line=result.start_line, - end_line=result.end_line, - symbol_name=result.symbol_name, - symbol_kind=result.symbol_kind, - additional_locations=list(result.additional_locations), - ) - ) - - boosted_results.sort(key=lambda r: r.score, reverse=True) - return boosted_results - 
- -def rerank_results( - query: str, - results: List[SearchResult], - embedder: Any, - top_k: int = 50, -) -> List[SearchResult]: - """Re-rank results with embedding cosine similarity, combined with current score. - - Combined score formula: - 0.5 * rrf_score + 0.5 * cosine_similarity - - If embedder is None or embedding fails, returns results as-is. - """ - if not results: - return [] - - if embedder is None or top_k <= 0: - return results - - rerank_count = min(int(top_k), len(results)) - - def cosine_similarity(vec_a: List[float], vec_b: List[float]) -> float: - # Defensive: handle mismatched lengths and zero vectors. - n = min(len(vec_a), len(vec_b)) - if n == 0: - return 0.0 - dot = 0.0 - norm_a = 0.0 - norm_b = 0.0 - for i in range(n): - a = float(vec_a[i]) - b = float(vec_b[i]) - dot += a * b - norm_a += a * a - norm_b += b * b - if norm_a <= 0.0 or norm_b <= 0.0: - return 0.0 - sim = dot / (math.sqrt(norm_a) * math.sqrt(norm_b)) - # SearchResult.score requires non-negative scores; clamp cosine similarity to [0, 1]. - return max(0.0, min(1.0, sim)) - - def text_for_embedding(r: SearchResult) -> str: - if r.excerpt and r.excerpt.strip(): - return r.excerpt - if r.content and r.content.strip(): - return r.content - if r.chunk and r.chunk.content and r.chunk.content.strip(): - return r.chunk.content - # Fallback: stable, non-empty text. 
- return r.symbol_name or r.path - - try: - if hasattr(embedder, "embed_single"): - query_vec = embedder.embed_single(query) - else: - query_vec = embedder.embed(query)[0] - - doc_texts = [text_for_embedding(r) for r in results[:rerank_count]] - doc_vecs = embedder.embed(doc_texts) - except Exception: - return results - - reranked_results: List[SearchResult] = [] - - for idx, result in enumerate(results): - if idx < rerank_count: - rrf_score = float(result.score) - sim = cosine_similarity(query_vec, doc_vecs[idx]) - combined_score = 0.5 * rrf_score + 0.5 * sim - - reranked_results.append( - SearchResult( - path=result.path, - score=combined_score, - excerpt=result.excerpt, - content=result.content, - symbol=result.symbol, - chunk=result.chunk, - metadata={ - **result.metadata, - "rrf_score": rrf_score, - "cosine_similarity": sim, - "reranked": True, - }, - start_line=result.start_line, - end_line=result.end_line, - symbol_name=result.symbol_name, - symbol_kind=result.symbol_kind, - additional_locations=list(result.additional_locations), - ) - ) - else: - # Preserve remaining results without re-ranking, but keep immutability. - reranked_results.append( - SearchResult( - path=result.path, - score=result.score, - excerpt=result.excerpt, - content=result.content, - symbol=result.symbol, - chunk=result.chunk, - metadata={**result.metadata}, - start_line=result.start_line, - end_line=result.end_line, - symbol_name=result.symbol_name, - symbol_kind=result.symbol_kind, - additional_locations=list(result.additional_locations), - ) - ) - - reranked_results.sort(key=lambda r: r.score, reverse=True) - return reranked_results - - -def cross_encoder_rerank( - query: str, - results: List[SearchResult], - reranker: Any, - top_k: int = 50, - batch_size: int = 32, - chunk_type_weights: Optional[Dict[str, float]] = None, - test_file_penalty: float = 0.0, -) -> List[SearchResult]: - """Second-stage reranking using a cross-encoder model. 
- - This function is dependency-agnostic: callers can pass any object that exposes - a compatible `score_pairs(pairs, batch_size=...)` method. - - Args: - query: Search query string - results: List of search results to rerank - reranker: Cross-encoder model with score_pairs or predict method - top_k: Number of top results to rerank - batch_size: Batch size for reranking - chunk_type_weights: Optional weights for different chunk types. - Example: {"code": 1.0, "docstring": 0.7} - reduce docstring influence - test_file_penalty: Penalty applied to test files (0.0-1.0). - Example: 0.2 means test files get 20% score reduction - """ - if not results: - return [] - - if reranker is None or top_k <= 0: - return results - - rerank_count = min(int(top_k), len(results)) - - def text_for_pair(r: SearchResult) -> str: - if r.excerpt and r.excerpt.strip(): - return r.excerpt - if r.content and r.content.strip(): - return r.content - if r.chunk and r.chunk.content and r.chunk.content.strip(): - return r.chunk.content - return r.symbol_name or r.path - - pairs = [(query, text_for_pair(r)) for r in results[:rerank_count]] - - try: - if hasattr(reranker, "score_pairs"): - raw_scores = reranker.score_pairs(pairs, batch_size=int(batch_size)) - elif hasattr(reranker, "predict"): - raw_scores = reranker.predict(pairs, batch_size=int(batch_size)) - else: - return results - except Exception: - return results - - if not raw_scores or len(raw_scores) != rerank_count: - return results - - scores = [float(s) for s in raw_scores] - min_s = min(scores) - max_s = max(scores) - - def sigmoid(x: float) -> float: - # Clamp to keep exp() stable. 
- x = max(-50.0, min(50.0, x)) - return 1.0 / (1.0 + math.exp(-x)) - - if 0.0 <= min_s and max_s <= 1.0: - probs = scores - else: - probs = [sigmoid(s) for s in scores] - - reranked_results: List[SearchResult] = [] - - # Helper to detect test files - def is_test_file(path: str) -> bool: - if not path: - return False - basename = path.split("/")[-1].split("\\")[-1] - return ( - basename.startswith("test_") or - basename.endswith("_test.py") or - basename.endswith(".test.ts") or - basename.endswith(".test.js") or - basename.endswith(".spec.ts") or - basename.endswith(".spec.js") or - "/tests/" in path or - "\\tests\\" in path or - "/test/" in path or - "\\test\\" in path - ) - - for idx, result in enumerate(results): - if idx < rerank_count: - prev_score = float(result.score) - ce_score = scores[idx] - ce_prob = probs[idx] - - # Base combined score - combined_score = 0.5 * prev_score + 0.5 * ce_prob - - # Apply chunk_type weight adjustment - if chunk_type_weights: - chunk_type = None - if result.chunk and hasattr(result.chunk, "metadata"): - chunk_type = result.chunk.metadata.get("chunk_type") - elif result.metadata: - chunk_type = result.metadata.get("chunk_type") - - if chunk_type and chunk_type in chunk_type_weights: - weight = chunk_type_weights[chunk_type] - # Apply weight to CE contribution only - combined_score = 0.5 * prev_score + 0.5 * ce_prob * weight - - # Apply test file penalty - if test_file_penalty > 0 and is_test_file(result.path): - combined_score = combined_score * (1.0 - test_file_penalty) - - reranked_results.append( - SearchResult( - path=result.path, - score=combined_score, - excerpt=result.excerpt, - content=result.content, - symbol=result.symbol, - chunk=result.chunk, - metadata={ - **result.metadata, - "pre_cross_encoder_score": prev_score, - "cross_encoder_score": ce_score, - "cross_encoder_prob": ce_prob, - "cross_encoder_reranked": True, - }, - start_line=result.start_line, - end_line=result.end_line, - symbol_name=result.symbol_name, - 
symbol_kind=result.symbol_kind, - additional_locations=list(result.additional_locations), - ) - ) - else: - reranked_results.append( - SearchResult( - path=result.path, - score=result.score, - excerpt=result.excerpt, - content=result.content, - symbol=result.symbol, - chunk=result.chunk, - metadata={**result.metadata}, - start_line=result.start_line, - end_line=result.end_line, - symbol_name=result.symbol_name, - symbol_kind=result.symbol_kind, - additional_locations=list(result.additional_locations), - ) - ) - - reranked_results.sort(key=lambda r: r.score, reverse=True) - return reranked_results - - -def normalize_bm25_score(score: float) -> float: - """Normalize BM25 scores from SQLite FTS5 to 0-1 range. - - SQLite FTS5 returns negative BM25 scores (more negative = better match). - Uses sigmoid transformation for normalization. - - Args: - score: Raw BM25 score from SQLite (typically negative) - - Returns: - Normalized score in range [0, 1] - - Examples: - >>> normalize_bm25_score(-10.5) # Good match - 0.85 - >>> normalize_bm25_score(-1.2) # Weak match - 0.62 - """ - # Take absolute value (BM25 is negative in SQLite) - abs_score = abs(score) - - # Sigmoid transformation: 1 / (1 + e^(-x)) - # Scale factor of 0.1 maps typical BM25 range (-20 to 0) to (0, 1) - normalized = 1.0 / (1.0 + math.exp(-abs_score * 0.1)) - - return normalized - - -def tag_search_source(results: List[SearchResult], source: str) -> List[SearchResult]: - """Tag search results with their source for RRF tracking. 
- - Args: - results: List of SearchResult objects - source: Source identifier ('exact', 'fuzzy', 'vector') - - Returns: - List of SearchResult objects with 'search_source' in metadata - """ - tagged_results = [] - for result in results: - tagged_result = SearchResult( - path=result.path, - score=result.score, - excerpt=result.excerpt, - content=result.content, - symbol=result.symbol, - chunk=result.chunk, - metadata={**result.metadata, "search_source": source}, - start_line=result.start_line, - end_line=result.end_line, - symbol_name=result.symbol_name, - symbol_kind=result.symbol_kind, - ) - tagged_results.append(tagged_result) - - return tagged_results - - -def group_similar_results( - results: List[SearchResult], - score_threshold_abs: float = 0.01, - content_field: str = "excerpt" -) -> List[SearchResult]: - """Group search results by content and score similarity. - - Groups results that have similar content and similar scores into a single - representative result, with other locations stored in additional_locations. - - Algorithm: - 1. Group results by content (using excerpt or content field) - 2. Within each content group, create subgroups based on score similarity - 3. Select highest-scoring result as representative for each subgroup - 4. Store other results in subgroup as additional_locations - - Args: - results: A list of SearchResult objects (typically sorted by score) - score_threshold_abs: Absolute score difference to consider results similar. - Results with |score_a - score_b| <= threshold are grouped. - Default 0.01 is suitable for RRF fusion scores. - content_field: The field to use for content grouping ('excerpt' or 'content') - - Returns: - A new list of SearchResult objects where similar items are grouped. - The list is sorted by score descending. - - Examples: - >>> results = [SearchResult(path="a.py", score=0.5, excerpt="def foo()"), - ... 
SearchResult(path="b.py", score=0.5, excerpt="def foo()")] - >>> grouped = group_similar_results(results) - >>> len(grouped) # Two results merged into one - 1 - >>> len(grouped[0].additional_locations) # One additional location - 1 - """ - if not results: - return [] - - # Group results by content - content_map: Dict[str, List[SearchResult]] = {} - unidentifiable_results: List[SearchResult] = [] - - for r in results: - key = getattr(r, content_field, None) - if key and key.strip(): - content_map.setdefault(key, []).append(r) - else: - # Results without content can't be grouped by content - unidentifiable_results.append(r) - - final_results: List[SearchResult] = [] - - # Process each content group - for content_group in content_map.values(): - # Sort by score descending within group - content_group.sort(key=lambda r: r.score, reverse=True) - - while content_group: - # Take highest scoring as representative - representative = content_group.pop(0) - others_in_group = [] - remaining_for_next_pass = [] - - # Find results with similar scores - for item in content_group: - if abs(representative.score - item.score) <= score_threshold_abs: - others_in_group.append(item) - else: - remaining_for_next_pass.append(item) - - # Create grouped result with additional locations - if others_in_group: - # Build new result with additional_locations populated - grouped_result = SearchResult( - path=representative.path, - score=representative.score, - excerpt=representative.excerpt, - content=representative.content, - symbol=representative.symbol, - chunk=representative.chunk, - metadata={ - **representative.metadata, - "grouped_count": len(others_in_group) + 1, - }, - start_line=representative.start_line, - end_line=representative.end_line, - symbol_name=representative.symbol_name, - symbol_kind=representative.symbol_kind, - additional_locations=[ - AdditionalLocation( - path=other.path, - score=other.score, - start_line=other.start_line, - end_line=other.end_line, - 
symbol_name=other.symbol_name, - ) for other in others_in_group - ], - ) - final_results.append(grouped_result) - else: - final_results.append(representative) - - content_group = remaining_for_next_pass - - # Add ungroupable results - final_results.extend(unidentifiable_results) - - # Sort final results by score descending - final_results.sort(key=lambda r: r.score, reverse=True) - - return final_results diff --git a/codex-lens/build/lib/codexlens/semantic/__init__.py b/codex-lens/build/lib/codexlens/semantic/__init__.py deleted file mode 100644 index b9bd040f..00000000 --- a/codex-lens/build/lib/codexlens/semantic/__init__.py +++ /dev/null @@ -1,118 +0,0 @@ -"""Optional semantic search module for CodexLens. - -Install with: pip install codexlens[semantic] -Uses fastembed (ONNX-based, lightweight ~200MB) - -GPU Acceleration: -- Automatic GPU detection and usage when available -- Supports CUDA (NVIDIA), TensorRT, DirectML (Windows), ROCm (AMD), CoreML (Apple) -- Install GPU support: pip install onnxruntime-gpu (NVIDIA) or onnxruntime-directml (Windows) -""" - -from __future__ import annotations - -SEMANTIC_AVAILABLE = False -SEMANTIC_BACKEND: str | None = None -GPU_AVAILABLE = False -LITELLM_AVAILABLE = False -_import_error: str | None = None - - -def _detect_backend() -> tuple[bool, str | None, bool, str | None]: - """Detect if fastembed and GPU are available.""" - try: - import numpy as np - except ImportError as e: - return False, None, False, f"numpy not available: {e}" - - try: - from fastembed import TextEmbedding - except ImportError: - return False, None, False, "fastembed not available. 
Install with: pip install codexlens[semantic]" - - # Check GPU availability - gpu_available = False - try: - from .gpu_support import is_gpu_available - gpu_available = is_gpu_available() - except ImportError: - pass - - return True, "fastembed", gpu_available, None - - -# Initialize on module load -SEMANTIC_AVAILABLE, SEMANTIC_BACKEND, GPU_AVAILABLE, _import_error = _detect_backend() - - -def check_semantic_available() -> tuple[bool, str | None]: - """Check if semantic search dependencies are available.""" - return SEMANTIC_AVAILABLE, _import_error - - -def check_gpu_available() -> tuple[bool, str]: - """Check if GPU acceleration is available. - - Returns: - Tuple of (is_available, status_message) - """ - if not SEMANTIC_AVAILABLE: - return False, "Semantic search not available" - - try: - from .gpu_support import is_gpu_available, get_gpu_summary - if is_gpu_available(): - return True, get_gpu_summary() - return False, "No GPU detected (using CPU)" - except ImportError: - return False, "GPU support module not available" - - -# Export embedder components -# BaseEmbedder is always available (abstract base class) -from .base import BaseEmbedder - -# Factory function for creating embedders -from .factory import get_embedder as get_embedder_factory - -# Optional: LiteLLMEmbedderWrapper (only if ccw-litellm is installed) -try: - import ccw_litellm # noqa: F401 - from .litellm_embedder import LiteLLMEmbedderWrapper - LITELLM_AVAILABLE = True -except ImportError: - LiteLLMEmbedderWrapper = None - LITELLM_AVAILABLE = False - - -def is_embedding_backend_available(backend: str) -> tuple[bool, str | None]: - """Check whether a specific embedding backend can be used. - - Notes: - - "fastembed" requires the optional semantic deps (pip install codexlens[semantic]). - - "litellm" requires ccw-litellm to be installed in the same environment. 
- """ - backend = (backend or "").strip().lower() - if backend == "fastembed": - if SEMANTIC_AVAILABLE: - return True, None - return False, _import_error or "fastembed not available. Install with: pip install codexlens[semantic]" - if backend == "litellm": - if LITELLM_AVAILABLE: - return True, None - return False, "ccw-litellm not available. Install with: pip install ccw-litellm" - return False, f"Invalid embedding backend: {backend}. Must be 'fastembed' or 'litellm'." - - -__all__ = [ - "SEMANTIC_AVAILABLE", - "SEMANTIC_BACKEND", - "GPU_AVAILABLE", - "LITELLM_AVAILABLE", - "check_semantic_available", - "is_embedding_backend_available", - "check_gpu_available", - "BaseEmbedder", - "get_embedder_factory", - "LiteLLMEmbedderWrapper", -] diff --git a/codex-lens/build/lib/codexlens/semantic/ann_index.py b/codex-lens/build/lib/codexlens/semantic/ann_index.py deleted file mode 100644 index 0d10e742..00000000 --- a/codex-lens/build/lib/codexlens/semantic/ann_index.py +++ /dev/null @@ -1,1068 +0,0 @@ -"""Approximate Nearest Neighbor (ANN) index using HNSW algorithm. - -Provides O(log N) similarity search using hnswlib's Hierarchical Navigable Small World graphs. -Falls back to brute-force search when hnswlib is not available. - -Key features: -- HNSW index for fast approximate nearest neighbor search -- Persistent index storage (saved alongside SQLite database) -- Incremental vector addition and deletion -- Thread-safe operations -- Cosine similarity metric -- Support for centralized storage mode (single index at project root) -""" - -from __future__ import annotations - -import logging -import threading -from pathlib import Path -from typing import List, Optional, Tuple - -from codexlens.errors import StorageError -from codexlens.config import VECTORS_HNSW_NAME - -from . 
import SEMANTIC_AVAILABLE - -if SEMANTIC_AVAILABLE: - import numpy as np - -logger = logging.getLogger(__name__) - -# Try to import hnswlib (optional dependency) -try: - import hnswlib - - HNSWLIB_AVAILABLE = True -except ImportError: - HNSWLIB_AVAILABLE = False - - -class ANNIndex: - """HNSW-based approximate nearest neighbor index for vector similarity search. - - Performance characteristics: - - Build time: O(N log N) where N is number of vectors - - Search time: O(log N) approximate - - Memory: ~(M * 2 * 4 * d) bytes per vector (M=16, d=dimension) - - Index parameters: - - space: cosine (cosine similarity metric) - - M: 16 (max connections per node - balance between speed and recall) - - ef_construction: 200 (search width during build - higher = better quality) - - ef: 50 (search width during query - higher = better recall) - """ - - def __init__( - self, - index_path: Path, - dim: int, - initial_capacity: int = 50000, - auto_save: bool = False, - expansion_threshold: float = 0.8, - ) -> None: - """Initialize ANN index. - - Args: - index_path: Path to SQLite database (index will be saved as _vectors.hnsw) - dim: Dimension of embedding vectors - initial_capacity: Initial maximum elements capacity (default: 50000) - auto_save: Whether to automatically save index after operations (default: False) - expansion_threshold: Capacity threshold to trigger auto-expansion (default: 0.8) - - Raises: - ImportError: If required dependencies are not available - ValueError: If dimension or capacity is invalid - """ - if not SEMANTIC_AVAILABLE: - raise ImportError( - "Semantic search dependencies not available. " - "Install with: pip install codexlens[semantic]" - ) - - if not HNSWLIB_AVAILABLE: - raise ImportError( - "hnswlib is required for ANN index. 
" - "Install with: pip install hnswlib" - ) - - if dim <= 0: - raise ValueError(f"Invalid dimension: {dim}") - - if initial_capacity <= 0: - raise ValueError(f"Invalid initial capacity: {initial_capacity}") - - if not 0.0 < expansion_threshold < 1.0: - raise ValueError( - f"Invalid expansion threshold: {expansion_threshold}. Must be between 0 and 1." - ) - - self.index_path = Path(index_path) - self.dim = dim - - # Derive HNSW index path from database path - # e.g., /path/to/_index.db -> /path/to/_index_vectors.hnsw - # This ensures unique HNSW files for each database - db_stem = self.index_path.stem # e.g., "_index" or "tmp123" - self.hnsw_path = self.index_path.parent / f"{db_stem}_vectors.hnsw" - - # HNSW parameters - self.space = "cosine" # Cosine similarity metric - self.M = 16 # Max connections per node (16 is good balance) - self.ef_construction = 200 # Build-time search width (higher = better quality) - self.ef = 50 # Query-time search width (higher = better recall) - - # Memory management parameters - self._auto_save = auto_save - self._expansion_threshold = expansion_threshold - - # Thread safety - self._lock = threading.RLock() - - # HNSW index instance - self._index: Optional[hnswlib.Index] = None - self._max_elements = initial_capacity # Initial capacity (reduced from 1M to 50K) - self._current_count = 0 # Track number of vectors - - logger.info( - f"Initialized ANNIndex with capacity={initial_capacity}, " - f"auto_save={auto_save}, expansion_threshold={expansion_threshold}" - ) - - @classmethod - def create_central( - cls, - index_root: Path, - dim: int, - initial_capacity: int = 50000, - auto_save: bool = False, - expansion_threshold: float = 0.8, - ) -> "ANNIndex": - """Create a centralized ANN index at the project index root. - - This method creates a single shared HNSW index file at the project root, - rather than per-directory indexes. Use this for projects that want all - dense vectors stored in one central location. 
- - Args: - index_root: Root directory for the index (e.g., .codexlens//) - dim: Dimension of embedding vectors - initial_capacity: Initial maximum elements capacity (default: 50000) - auto_save: Whether to automatically save index after operations (default: False) - expansion_threshold: Capacity threshold to trigger auto-expansion (default: 0.8) - - Returns: - ANNIndex instance configured for centralized storage - - Example: - >>> index = ANNIndex.create_central(Path(".codexlens/abc123"), dim=768) - >>> index.hnsw_path # Returns: .codexlens/abc123/_vectors.hnsw - """ - # Create a dummy index_path that will result in the central hnsw_path - # The index_path is used to derive hnsw_path, so we create a virtual path - # such that self.hnsw_path = index_root / VECTORS_HNSW_NAME - instance = cls.__new__(cls) - - if not SEMANTIC_AVAILABLE: - raise ImportError( - "Semantic search dependencies not available. " - "Install with: pip install codexlens[semantic]" - ) - - if not HNSWLIB_AVAILABLE: - raise ImportError( - "hnswlib is required for ANN index. " - "Install with: pip install hnswlib" - ) - - if dim <= 0: - raise ValueError(f"Invalid dimension: {dim}") - - if initial_capacity <= 0: - raise ValueError(f"Invalid initial capacity: {initial_capacity}") - - if not 0.0 < expansion_threshold < 1.0: - raise ValueError( - f"Invalid expansion threshold: {expansion_threshold}. Must be between 0 and 1." 
- ) - - instance.index_path = index_root - instance.dim = dim - - # Centralized mode: use VECTORS_HNSW_NAME directly at index_root - instance.hnsw_path = index_root / VECTORS_HNSW_NAME - - # HNSW parameters - instance.space = "cosine" - instance.M = 16 - instance.ef_construction = 200 - instance.ef = 50 - - # Memory management parameters - instance._auto_save = auto_save - instance._expansion_threshold = expansion_threshold - - # Thread safety - instance._lock = threading.RLock() - - # HNSW index instance - instance._index: Optional[hnswlib.Index] = None - instance._max_elements = initial_capacity - instance._current_count = 0 - - logger.info( - f"Initialized centralized ANNIndex at {instance.hnsw_path} with " - f"capacity={initial_capacity}, auto_save={auto_save}" - ) - - return instance - - def _ensure_index(self) -> None: - """Ensure HNSW index is initialized (lazy initialization).""" - if self._index is None: - self._index = hnswlib.Index(space=self.space, dim=self.dim) - self._index.init_index( - max_elements=self._max_elements, - ef_construction=self.ef_construction, - M=self.M, - ) - self._index.set_ef(self.ef) - self._current_count = 0 - logger.debug(f"Created new HNSW index with capacity {self._max_elements}") - - def _auto_expand_if_needed(self, additional_count: int) -> None: - """Auto-expand index capacity if threshold is reached. - - Args: - additional_count: Number of vectors to be added - - Note: - This is called internally by add_vectors and is thread-safe. 
- """ - usage_ratio = (self._current_count + additional_count) / self._max_elements - - if usage_ratio >= self._expansion_threshold: - # Calculate new capacity (2x current or enough to fit new vectors) - new_capacity = max( - self._max_elements * 2, - self._current_count + additional_count, - ) - - logger.info( - f"Expanding index capacity: {self._max_elements} -> {new_capacity} " - f"(usage: {usage_ratio:.1%}, threshold: {self._expansion_threshold:.1%})" - ) - - self._index.resize_index(new_capacity) - self._max_elements = new_capacity - - def add_vectors(self, ids: List[int], vectors: np.ndarray) -> None: - """Add vectors to the index. - - Args: - ids: List of vector IDs (must be unique) - vectors: Numpy array of shape (N, dim) where N = len(ids) - - Raises: - ValueError: If shapes don't match or vectors are invalid - StorageError: If index operation fails - """ - if len(ids) == 0: - return - - if vectors.shape[0] != len(ids): - raise ValueError( - f"Number of vectors ({vectors.shape[0]}) must match number of IDs ({len(ids)})" - ) - - if vectors.shape[1] != self.dim: - raise ValueError( - f"Vector dimension ({vectors.shape[1]}) must match index dimension ({self.dim})" - ) - - with self._lock: - try: - self._ensure_index() - - # Auto-expand if threshold reached - self._auto_expand_if_needed(len(ids)) - - # Ensure vectors are C-contiguous float32 (hnswlib requirement) - if not vectors.flags['C_CONTIGUOUS'] or vectors.dtype != np.float32: - vectors = np.ascontiguousarray(vectors, dtype=np.float32) - - # Add vectors to index - self._index.add_items(vectors, ids) - self._current_count += len(ids) - - logger.debug( - f"Added {len(ids)} vectors to index " - f"(total: {self._current_count}/{self._max_elements})" - ) - - # Auto-save if enabled - if self._auto_save: - self.save() - - except Exception as e: - raise StorageError(f"Failed to add vectors to ANN index: {e}") - - def remove_vectors(self, ids: List[int]) -> None: - """Remove vectors from the index by marking them 
as deleted. - - Note: hnswlib uses soft deletion (mark_deleted). Vectors are not - physically removed but will be excluded from search results. - - Args: - ids: List of vector IDs to remove - - Raises: - StorageError: If index operation fails - """ - if len(ids) == 0: - return - - with self._lock: - try: - if self._index is None or self._current_count == 0: - return # Nothing to remove - - # Mark vectors as deleted - deleted_count = 0 - for vec_id in ids: - try: - self._index.mark_deleted(vec_id) - deleted_count += 1 - except RuntimeError: - # ID not found - ignore (idempotent deletion) - pass - - logger.debug(f"Marked {deleted_count}/{len(ids)} vectors as deleted") - - # Auto-save if enabled - if self._auto_save and deleted_count > 0: - self.save() - - except Exception as e: - raise StorageError(f"Failed to remove vectors from ANN index: {e}") - - def search( - self, query: np.ndarray, top_k: int = 10 - ) -> Tuple[List[int], List[float]]: - """Search for nearest neighbors. - - Args: - query: Query vector of shape (dim,) or (1, dim) - top_k: Number of nearest neighbors to return - - Returns: - Tuple of (ids, distances) where: - - ids: List of vector IDs ordered by similarity - - distances: List of cosine distances (lower = more similar) - - Raises: - ValueError: If query shape is invalid - StorageError: If search operation fails - """ - # Validate query shape - if query.ndim == 1: - query = query.reshape(1, -1) - - if query.shape[0] != 1: - raise ValueError( - f"Query must be a single vector, got shape {query.shape}" - ) - - if query.shape[1] != self.dim: - raise ValueError( - f"Query dimension ({query.shape[1]}) must match index dimension ({self.dim})" - ) - - with self._lock: - try: - if self._index is None or self._current_count == 0: - return [], [] # Empty index - - # Perform kNN search - labels, distances = self._index.knn_query(query, k=top_k) - - # Convert to lists and flatten (knn_query returns 2D arrays) - ids = labels[0].tolist() - dists = 
distances[0].tolist() - - return ids, dists - - except Exception as e: - raise StorageError(f"Failed to search ANN index: {e}") - - def save(self) -> None: - """Save index to disk. - - Index is saved to [db_path_directory]/_vectors.hnsw - - Raises: - StorageError: If save operation fails - """ - with self._lock: - try: - if self._index is None or self._current_count == 0: - logger.debug("Skipping save: index is empty") - return # Nothing to save - - # Ensure parent directory exists - self.hnsw_path.parent.mkdir(parents=True, exist_ok=True) - - # Save index - self._index.save_index(str(self.hnsw_path)) - - logger.debug( - f"Saved index to {self.hnsw_path} " - f"({self._current_count} vectors, capacity: {self._max_elements})" - ) - - except Exception as e: - raise StorageError(f"Failed to save ANN index: {e}") - - def load(self) -> bool: - """Load index from disk. - - Returns: - True if index was loaded successfully, False if index file doesn't exist - - Raises: - StorageError: If load operation fails - """ - with self._lock: - try: - if not self.hnsw_path.exists(): - logger.debug(f"Index file not found: {self.hnsw_path}") - return False # Index file doesn't exist (not an error) - - # Create fresh index object for loading (don't call init_index first) - self._index = hnswlib.Index(space=self.space, dim=self.dim) - - # Load index from disk - # Note: max_elements here is just for initial allocation, can expand later - self._index.load_index(str(self.hnsw_path), max_elements=self._max_elements) - - # Update count and capacity from loaded index - self._current_count = self._index.get_current_count() - self._max_elements = self._index.get_max_elements() - - # Set query-time ef parameter - self._index.set_ef(self.ef) - - logger.info( - f"Loaded index from {self.hnsw_path} " - f"({self._current_count} vectors, capacity: {self._max_elements})" - ) - - return True - - except Exception as e: - raise StorageError(f"Failed to load ANN index: {e}") - - def count(self) -> int: - 
"""Get number of vectors in the index. - - Returns: - Number of vectors currently in the index - """ - with self._lock: - return self._current_count - - @property - def capacity(self) -> int: - """Get current maximum capacity of the index. - - Returns: - Maximum number of vectors the index can hold before expansion - """ - with self._lock: - return self._max_elements - - @property - def usage_ratio(self) -> float: - """Get current usage ratio (count / capacity). - - Returns: - Usage ratio between 0.0 and 1.0 - """ - with self._lock: - if self._max_elements == 0: - return 0.0 - return self._current_count / self._max_elements - - @property - def is_loaded(self) -> bool: - """Check if index is loaded and ready for use. - - Returns: - True if index is loaded, False otherwise - """ - with self._lock: - return self._index is not None and self._current_count > 0 - - - -class BinaryANNIndex: - """Binary vector ANN index using Hamming distance for fast coarse retrieval. - - .. deprecated:: - This class is deprecated. Use :class:`codexlens.search.binary_searcher.BinarySearcher` - instead, which provides faster memory-mapped search with centralized storage. - - Optimized for binary vectors (256-bit / 32 bytes per vector). - Uses packed binary representation for memory efficiency. 
- - Performance characteristics: - - Storage: 32 bytes per vector (vs ~8KB for dense vectors) - - Distance: Hamming distance via XOR + popcount (CPU-efficient) - - Search: O(N) brute-force with SIMD-accelerated distance computation - - Index parameters: - - dim: Binary vector dimension (default: 256) - - packed_dim: Packed bytes size (dim / 8 = 32 for 256-bit) - - Usage: - index = BinaryANNIndex(index_path, dim=256) - index.add_vectors([1, 2, 3], packed_vectors) # List of 32-byte packed vectors - ids, distances = index.search(query_packed, top_k=10) - """ - - DEFAULT_DIM = 256 # Default binary vector dimension - - def __init__( - self, - index_path: Path, - dim: int = 256, - initial_capacity: int = 100000, - auto_save: bool = False, - ) -> None: - """Initialize Binary ANN index. - - Args: - index_path: Path to database (index will be saved as _binary_vectors.bin) - dim: Dimension of binary vectors (default: 256) - initial_capacity: Initial capacity hint (default: 100000) - auto_save: Whether to automatically save index after operations - - Raises: - ImportError: If required dependencies are not available - ValueError: If dimension is invalid - """ - if not SEMANTIC_AVAILABLE: - raise ImportError( - "Semantic search dependencies not available. " - "Install with: pip install codexlens[semantic]" - ) - - import warnings - warnings.warn( - "BinaryANNIndex is deprecated. Use codexlens.search.binary_searcher.BinarySearcher " - "instead for faster memory-mapped search with centralized storage.", - DeprecationWarning, - stacklevel=2 - ) - - if dim <= 0 or dim % 8 != 0: - raise ValueError( - f"Invalid dimension: {dim}. Must be positive and divisible by 8." 
- ) - - self.index_path = Path(index_path) - self.dim = dim - self.packed_dim = dim // 8 # 32 bytes for 256-bit vectors - - # Derive binary index path from database path - db_stem = self.index_path.stem - self.binary_path = self.index_path.parent / f"{db_stem}_binary_vectors.bin" - - # Memory management - self._auto_save = auto_save - self._initial_capacity = initial_capacity - - # Thread safety - self._lock = threading.RLock() - - # In-memory storage: id -> packed binary vector - self._vectors: dict[int, bytes] = {} - self._id_list: list[int] = [] # Ordered list for efficient iteration - - # Cached numpy array for vectorized search (invalidated on add/remove) - self._vectors_matrix: Optional[np.ndarray] = None - self._ids_array: Optional[np.ndarray] = None - self._cache_valid: bool = False - - logger.info( - f"Initialized BinaryANNIndex with dim={dim}, packed_dim={self.packed_dim}" - ) - - def add_vectors(self, ids: List[int], vectors: List[bytes]) -> None: - """Add packed binary vectors to the index. 
- - Args: - ids: List of vector IDs (must be unique) - vectors: List of packed binary vectors (each of size packed_dim bytes) - - Raises: - ValueError: If shapes don't match or vectors are invalid - StorageError: If index operation fails - """ - if len(ids) == 0: - return - - if len(vectors) != len(ids): - raise ValueError( - f"Number of vectors ({len(vectors)}) must match number of IDs ({len(ids)})" - ) - - # Validate vector sizes - for i, vec in enumerate(vectors): - if len(vec) != self.packed_dim: - raise ValueError( - f"Vector {i} has size {len(vec)}, expected {self.packed_dim}" - ) - - with self._lock: - try: - for vec_id, vec in zip(ids, vectors): - if vec_id not in self._vectors: - self._id_list.append(vec_id) - self._vectors[vec_id] = vec - - # Invalidate cache on modification - self._cache_valid = False - - logger.debug( - f"Added {len(ids)} binary vectors to index (total: {len(self._vectors)})" - ) - - if self._auto_save: - self.save() - - except Exception as e: - raise StorageError(f"Failed to add vectors to Binary ANN index: {e}") - - def add_vectors_numpy(self, ids: List[int], vectors: np.ndarray) -> None: - """Add unpacked binary vectors (0/1 values) to the index. - - Convenience method that packs the vectors before adding. 
- - Args: - ids: List of vector IDs (must be unique) - vectors: Numpy array of shape (N, dim) with binary values (0 or 1) - - Raises: - ValueError: If shapes don't match - StorageError: If index operation fails - """ - if len(ids) == 0: - return - - if vectors.shape[0] != len(ids): - raise ValueError( - f"Number of vectors ({vectors.shape[0]}) must match number of IDs ({len(ids)})" - ) - - if vectors.shape[1] != self.dim: - raise ValueError( - f"Vector dimension ({vectors.shape[1]}) must match index dimension ({self.dim})" - ) - - # Pack vectors - packed_vectors = [] - for i in range(vectors.shape[0]): - packed = np.packbits(vectors[i].astype(np.uint8)).tobytes() - packed_vectors.append(packed) - - self.add_vectors(ids, packed_vectors) - - def remove_vectors(self, ids: List[int]) -> None: - """Remove vectors from the index. - - Args: - ids: List of vector IDs to remove - - Raises: - StorageError: If index operation fails - - Note: - Optimized for batch deletion using set operations instead of - O(N) list.remove() calls for each ID. - """ - if len(ids) == 0: - return - - with self._lock: - try: - # Use set for O(1) lookup during filtering - ids_to_remove = set(ids) - removed_count = 0 - - # Remove from dictionary - O(1) per deletion - for vec_id in ids_to_remove: - if vec_id in self._vectors: - del self._vectors[vec_id] - removed_count += 1 - - # Rebuild ID list efficiently - O(N) once instead of O(N) per removal - if removed_count > 0: - self._id_list = [id_ for id_ in self._id_list if id_ not in ids_to_remove] - # Invalidate cache on modification - self._cache_valid = False - - logger.debug(f"Removed {removed_count}/{len(ids)} vectors from index") - - if self._auto_save and removed_count > 0: - self.save() - - except Exception as e: - raise StorageError( - f"Failed to remove vectors from Binary ANN index: {e}" - ) - - def _build_cache(self) -> None: - """Build numpy array cache from vectors dict for vectorized search. 
- - Pre-computes a contiguous numpy array from all vectors for efficient - batch distance computation. Called lazily on first search after modification. - """ - if self._cache_valid: - return - - n_vectors = len(self._id_list) - if n_vectors == 0: - self._vectors_matrix = None - self._ids_array = None - self._cache_valid = True - return - - # Build contiguous numpy array of all packed vectors - # Shape: (n_vectors, packed_dim) with uint8 dtype - self._vectors_matrix = np.empty((n_vectors, self.packed_dim), dtype=np.uint8) - self._ids_array = np.array(self._id_list, dtype=np.int64) - - for i, vec_id in enumerate(self._id_list): - vec_bytes = self._vectors[vec_id] - self._vectors_matrix[i] = np.frombuffer(vec_bytes, dtype=np.uint8) - - self._cache_valid = True - logger.debug(f"Built vectorized cache for {n_vectors} binary vectors") - - def search( - self, query: bytes, top_k: int = 10 - ) -> Tuple[List[int], List[int]]: - """Search for nearest neighbors using Hamming distance. - - Uses vectorized batch computation for O(N) search with SIMD acceleration. - Pre-computes and caches numpy arrays for efficient repeated queries. - - Args: - query: Packed binary query vector (size: packed_dim bytes) - top_k: Number of nearest neighbors to return - - Returns: - Tuple of (ids, distances) where: - - ids: List of vector IDs ordered by Hamming distance (ascending) - - distances: List of Hamming distances (lower = more similar) - - Raises: - ValueError: If query size is invalid - StorageError: If search operation fails - """ - if len(query) != self.packed_dim: - raise ValueError( - f"Query size ({len(query)}) must match packed_dim ({self.packed_dim})" - ) - - with self._lock: - try: - if len(self._vectors) == 0: - return [], [] - - # Build cache if needed (lazy initialization) - self._build_cache() - - if self._vectors_matrix is None or self._ids_array is None: - return [], [] - - # Vectorized Hamming distance computation - # 1. 
Convert query to numpy array - query_arr = np.frombuffer(query, dtype=np.uint8) - - # 2. Broadcast XOR: (1, packed_dim) XOR (n_vectors, packed_dim) - # Result shape: (n_vectors, packed_dim) - xor_result = np.bitwise_xor(query_arr, self._vectors_matrix) - - # 3. Vectorized popcount using lookup table for efficiency - # np.unpackbits is slow for large arrays, use popcount LUT instead - popcount_lut = np.array([bin(i).count('1') for i in range(256)], dtype=np.uint8) - bit_counts = popcount_lut[xor_result] - - # 4. Sum across packed bytes to get Hamming distance per vector - distances = bit_counts.sum(axis=1) - - # 5. Get top-k using argpartition (O(N) instead of O(N log N) for full sort) - n_vectors = len(distances) - k = min(top_k, n_vectors) - - if k == n_vectors: - # No partitioning needed, just sort all - sorted_indices = np.argsort(distances) - else: - # Use argpartition for O(N) partial sort - partition_indices = np.argpartition(distances, k)[:k] - # Sort only the top-k - top_k_distances = distances[partition_indices] - sorted_order = np.argsort(top_k_distances) - sorted_indices = partition_indices[sorted_order] - - # 6. Return results - result_ids = self._ids_array[sorted_indices].tolist() - result_dists = distances[sorted_indices].tolist() - - return result_ids, result_dists - - except Exception as e: - raise StorageError(f"Failed to search Binary ANN index: {e}") - - def search_numpy( - self, query: np.ndarray, top_k: int = 10 - ) -> Tuple[List[int], List[int]]: - """Search with unpacked binary query vector. - - Convenience method that packs the query before searching. 
- - Args: - query: Binary query vector of shape (dim,) with values 0 or 1 - top_k: Number of nearest neighbors to return - - Returns: - Tuple of (ids, distances) - """ - if query.ndim == 2: - query = query.flatten() - - if len(query) != self.dim: - raise ValueError( - f"Query dimension ({len(query)}) must match index dimension ({self.dim})" - ) - - packed_query = np.packbits(query.astype(np.uint8)).tobytes() - return self.search(packed_query, top_k) - - def search_batch( - self, queries: List[bytes], top_k: int = 10 - ) -> List[Tuple[List[int], List[int]]]: - """Batch search for multiple queries. - - Args: - queries: List of packed binary query vectors - top_k: Number of nearest neighbors to return per query - - Returns: - List of (ids, distances) tuples, one per query - """ - results = [] - for query in queries: - ids, dists = self.search(query, top_k) - results.append((ids, dists)) - return results - - def save(self) -> None: - """Save index to disk. - - Binary format: - - 4 bytes: magic number (0x42494E56 = "BINV") - - 4 bytes: version (1) - - 4 bytes: dim - - 4 bytes: packed_dim - - 4 bytes: num_vectors - - For each vector: - - 4 bytes: id - - packed_dim bytes: vector data - - Raises: - StorageError: If save operation fails - """ - with self._lock: - try: - if len(self._vectors) == 0: - logger.debug("Skipping save: index is empty") - return - - # Ensure parent directory exists - self.binary_path.parent.mkdir(parents=True, exist_ok=True) - - with open(self.binary_path, "wb") as f: - # Header - f.write(b"BINV") # Magic number - f.write(np.array([1], dtype=np.uint32).tobytes()) # Version - f.write(np.array([self.dim], dtype=np.uint32).tobytes()) - f.write(np.array([self.packed_dim], dtype=np.uint32).tobytes()) - f.write( - np.array([len(self._vectors)], dtype=np.uint32).tobytes() - ) - - # Vectors - for vec_id in self._id_list: - f.write(np.array([vec_id], dtype=np.uint32).tobytes()) - f.write(self._vectors[vec_id]) - - logger.debug( - f"Saved binary index to 
{self.binary_path} " - f"({len(self._vectors)} vectors)" - ) - - except Exception as e: - raise StorageError(f"Failed to save Binary ANN index: {e}") - - def load(self) -> bool: - """Load index from disk. - - Returns: - True if index was loaded successfully, False if index file doesn't exist - - Raises: - StorageError: If load operation fails - """ - with self._lock: - try: - if not self.binary_path.exists(): - logger.debug(f"Binary index file not found: {self.binary_path}") - return False - - with open(self.binary_path, "rb") as f: - # Read header - magic = f.read(4) - if magic != b"BINV": - raise StorageError( - f"Invalid binary index file: bad magic number" - ) - - version = np.frombuffer(f.read(4), dtype=np.uint32)[0] - if version != 1: - raise StorageError( - f"Unsupported binary index version: {version}" - ) - - file_dim = np.frombuffer(f.read(4), dtype=np.uint32)[0] - file_packed_dim = np.frombuffer(f.read(4), dtype=np.uint32)[0] - num_vectors = np.frombuffer(f.read(4), dtype=np.uint32)[0] - - if file_dim != self.dim or file_packed_dim != self.packed_dim: - raise StorageError( - f"Dimension mismatch: file has dim={file_dim}, " - f"packed_dim={file_packed_dim}, " - f"expected dim={self.dim}, packed_dim={self.packed_dim}" - ) - - # Clear existing data - self._vectors.clear() - self._id_list.clear() - self._cache_valid = False - - # Read vectors - for _ in range(num_vectors): - vec_id = np.frombuffer(f.read(4), dtype=np.uint32)[0] - vec_data = f.read(self.packed_dim) - self._vectors[int(vec_id)] = vec_data - self._id_list.append(int(vec_id)) - - logger.info( - f"Loaded binary index from {self.binary_path} " - f"({len(self._vectors)} vectors)" - ) - - return True - - except StorageError: - raise - except Exception as e: - raise StorageError(f"Failed to load Binary ANN index: {e}") - - def count(self) -> int: - """Get number of vectors in the index. 
- - Returns: - Number of vectors currently in the index - """ - with self._lock: - return len(self._vectors) - - @property - def is_loaded(self) -> bool: - """Check if index has vectors. - - Returns: - True if index has vectors, False otherwise - """ - with self._lock: - return len(self._vectors) > 0 - - def get_vector(self, vec_id: int) -> Optional[bytes]: - """Get a specific vector by ID. - - Args: - vec_id: Vector ID to retrieve - - Returns: - Packed binary vector or None if not found - """ - with self._lock: - return self._vectors.get(vec_id) - - def clear(self) -> None: - """Clear all vectors from the index.""" - with self._lock: - self._vectors.clear() - self._id_list.clear() - self._vectors_matrix = None - self._ids_array = None - self._cache_valid = False - logger.debug("Cleared binary index") - - -def create_ann_index( - index_path: Path, - index_type: str = "hnsw", - dim: int = 2048, - **kwargs, -) -> ANNIndex | BinaryANNIndex: - """Factory function to create an ANN index. - - Args: - index_path: Path to database file - index_type: Type of index - "hnsw" for dense vectors, "binary" for binary vectors - dim: Vector dimension (default: 2048 for dense, 256 for binary) - **kwargs: Additional arguments passed to the index constructor - - Returns: - ANNIndex for dense vectors or BinaryANNIndex for binary vectors - - Raises: - ValueError: If index_type is invalid - - Example: - >>> # Dense vector index (HNSW) - >>> dense_index = create_ann_index(path, index_type="hnsw", dim=2048) - >>> dense_index.add_vectors(ids, dense_vectors) - >>> - >>> # Binary vector index (Hamming distance) - >>> binary_index = create_ann_index(path, index_type="binary", dim=256) - >>> binary_index.add_vectors(ids, packed_vectors) - """ - index_type = index_type.lower() - - if index_type == "hnsw": - return ANNIndex(index_path=index_path, dim=dim, **kwargs) - elif index_type == "binary": - # Default to 256 for binary if not specified - if dim == 2048: # Default dense dim was used - dim = 
256 - return BinaryANNIndex(index_path=index_path, dim=dim, **kwargs) - else: - raise ValueError( - f"Invalid index_type: {index_type}. Must be 'hnsw' or 'binary'." - ) diff --git a/codex-lens/build/lib/codexlens/semantic/base.py b/codex-lens/build/lib/codexlens/semantic/base.py deleted file mode 100644 index bf8109a0..00000000 --- a/codex-lens/build/lib/codexlens/semantic/base.py +++ /dev/null @@ -1,61 +0,0 @@ -"""Base class for embedders. - -Defines the interface that all embedders must implement. -""" - -from __future__ import annotations - -from abc import ABC, abstractmethod -from typing import Iterable - -import numpy as np - - -class BaseEmbedder(ABC): - """Base class for all embedders. - - All embedder implementations must inherit from this class and implement - the abstract methods to ensure a consistent interface. - """ - - @property - @abstractmethod - def embedding_dim(self) -> int: - """Return embedding dimensions. - - Returns: - int: Dimension of the embedding vectors. - """ - ... - - @property - @abstractmethod - def model_name(self) -> str: - """Return model name. - - Returns: - str: Name or identifier of the underlying model. - """ - ... - - @property - def max_tokens(self) -> int: - """Return maximum token limit for embeddings. - - Returns: - int: Maximum number of tokens that can be embedded at once. - Default is 8192 if not overridden by implementation. - """ - return 8192 - - @abstractmethod - def embed_to_numpy(self, texts: str | Iterable[str]) -> np.ndarray: - """Embed texts to numpy array. - - Args: - texts: Single text or iterable of texts to embed. - - Returns: - numpy.ndarray: Array of shape (n_texts, embedding_dim) containing embeddings. - """ - ... 
diff --git a/codex-lens/build/lib/codexlens/semantic/chunker.py b/codex-lens/build/lib/codexlens/semantic/chunker.py deleted file mode 100644 index 05d3eb50..00000000 --- a/codex-lens/build/lib/codexlens/semantic/chunker.py +++ /dev/null @@ -1,821 +0,0 @@ -"""Code chunking strategies for semantic search. - -This module provides various chunking strategies for breaking down source code -into semantic chunks suitable for embedding and search. - -Lightweight Mode: - The ChunkConfig supports a `skip_token_count` option for performance optimization. - When enabled, token counting uses a fast character-based estimation (char/4) - instead of expensive tiktoken encoding. - - Use cases for lightweight mode: - - Large-scale indexing where speed is critical - - Scenarios where approximate token counts are acceptable - - Memory-constrained environments - - Initial prototyping and development - - Example: - # Default mode (accurate tiktoken encoding) - config = ChunkConfig() - chunker = Chunker(config) - - # Lightweight mode (fast char/4 estimation) - config = ChunkConfig(skip_token_count=True) - chunker = Chunker(config) - chunks = chunker.chunk_file(content, symbols, path, language) -""" - -from __future__ import annotations - -from dataclasses import dataclass -from pathlib import Path -from typing import List, Optional, Tuple - -from codexlens.entities import SemanticChunk, Symbol -from codexlens.parsers.tokenizer import get_default_tokenizer - - -@dataclass -class ChunkConfig: - """Configuration for chunking strategies.""" - max_chunk_size: int = 1000 # Max characters per chunk - overlap: int = 200 # Overlap for sliding window (increased from 100 for better context) - strategy: str = "auto" # Chunking strategy: auto, symbol, sliding_window, hybrid - min_chunk_size: int = 50 # Minimum chunk size - skip_token_count: bool = False # Skip expensive token counting (use char/4 estimate) - strip_comments: bool = True # Remove comments from chunk content for embedding - 
strip_docstrings: bool = True # Remove docstrings from chunk content for embedding - preserve_original: bool = True # Store original content in metadata when stripping - - -class CommentStripper: - """Remove comments from source code while preserving structure.""" - - @staticmethod - def strip_python_comments(content: str) -> str: - """Strip Python comments (# style) but preserve docstrings. - - Args: - content: Python source code - - Returns: - Code with comments removed - """ - lines = content.splitlines(keepends=True) - result_lines: List[str] = [] - in_string = False - string_char = None - - for line in lines: - new_line = [] - i = 0 - while i < len(line): - char = line[i] - - # Handle string literals - if char in ('"', "'") and not in_string: - # Check for triple quotes - if line[i:i+3] in ('"""', "'''"): - in_string = True - string_char = line[i:i+3] - new_line.append(line[i:i+3]) - i += 3 - continue - else: - in_string = True - string_char = char - elif in_string: - if string_char and len(string_char) == 3: - if line[i:i+3] == string_char: - in_string = False - new_line.append(line[i:i+3]) - i += 3 - string_char = None - continue - elif char == string_char: - # Check for escape - if i > 0 and line[i-1] != '\\': - in_string = False - string_char = None - - # Handle comments (only outside strings) - if char == '#' and not in_string: - # Rest of line is comment, skip it - new_line.append('\n' if line.endswith('\n') else '') - break - - new_line.append(char) - i += 1 - - result_lines.append(''.join(new_line)) - - return ''.join(result_lines) - - @staticmethod - def strip_c_style_comments(content: str) -> str: - """Strip C-style comments (// and /* */) from code. 
- - Args: - content: Source code with C-style comments - - Returns: - Code with comments removed - """ - result = [] - i = 0 - in_string = False - string_char = None - in_multiline_comment = False - - while i < len(content): - # Handle multi-line comment end - if in_multiline_comment: - if content[i:i+2] == '*/': - in_multiline_comment = False - i += 2 - continue - i += 1 - continue - - char = content[i] - - # Handle string literals - if char in ('"', "'", '`') and not in_string: - in_string = True - string_char = char - result.append(char) - i += 1 - continue - elif in_string: - result.append(char) - if char == string_char and (i == 0 or content[i-1] != '\\'): - in_string = False - string_char = None - i += 1 - continue - - # Handle comments - if content[i:i+2] == '//': - # Single line comment - skip to end of line - while i < len(content) and content[i] != '\n': - i += 1 - if i < len(content): - result.append('\n') - i += 1 - continue - - if content[i:i+2] == '/*': - in_multiline_comment = True - i += 2 - continue - - result.append(char) - i += 1 - - return ''.join(result) - - @classmethod - def strip_comments(cls, content: str, language: str) -> str: - """Strip comments based on language. - - Args: - content: Source code content - language: Programming language - - Returns: - Code with comments removed - """ - if language == "python": - return cls.strip_python_comments(content) - elif language in {"javascript", "typescript", "java", "c", "cpp", "go", "rust"}: - return cls.strip_c_style_comments(content) - return content - - -class DocstringStripper: - """Remove docstrings from source code.""" - - @staticmethod - def strip_python_docstrings(content: str) -> str: - """Strip Python docstrings (triple-quoted strings at module/class/function level). 
- - Args: - content: Python source code - - Returns: - Code with docstrings removed - """ - lines = content.splitlines(keepends=True) - result_lines: List[str] = [] - i = 0 - - while i < len(lines): - line = lines[i] - stripped = line.strip() - - # Check for docstring start - if stripped.startswith('"""') or stripped.startswith("'''"): - quote_type = '"""' if stripped.startswith('"""') else "'''" - - # Single line docstring - if stripped.count(quote_type) >= 2: - # Skip this line (docstring) - i += 1 - continue - - # Multi-line docstring - skip until closing - i += 1 - while i < len(lines): - if quote_type in lines[i]: - i += 1 - break - i += 1 - continue - - result_lines.append(line) - i += 1 - - return ''.join(result_lines) - - @staticmethod - def strip_jsdoc_comments(content: str) -> str: - """Strip JSDoc comments (/** ... */) from code. - - Args: - content: JavaScript/TypeScript source code - - Returns: - Code with JSDoc comments removed - """ - result = [] - i = 0 - in_jsdoc = False - - while i < len(content): - if in_jsdoc: - if content[i:i+2] == '*/': - in_jsdoc = False - i += 2 - continue - i += 1 - continue - - # Check for JSDoc start (/** but not /*) - if content[i:i+3] == '/**': - in_jsdoc = True - i += 3 - continue - - result.append(content[i]) - i += 1 - - return ''.join(result) - - @classmethod - def strip_docstrings(cls, content: str, language: str) -> str: - """Strip docstrings based on language. 
- - Args: - content: Source code content - language: Programming language - - Returns: - Code with docstrings removed - """ - if language == "python": - return cls.strip_python_docstrings(content) - elif language in {"javascript", "typescript"}: - return cls.strip_jsdoc_comments(content) - return content - - -class Chunker: - """Chunk code files for semantic embedding.""" - - def __init__(self, config: ChunkConfig | None = None) -> None: - self.config = config or ChunkConfig() - self._tokenizer = get_default_tokenizer() - self._comment_stripper = CommentStripper() - self._docstring_stripper = DocstringStripper() - - def _process_content(self, content: str, language: str) -> Tuple[str, Optional[str]]: - """Process chunk content by stripping comments/docstrings if configured. - - Args: - content: Original chunk content - language: Programming language - - Returns: - Tuple of (processed_content, original_content_if_preserved) - """ - original = content if self.config.preserve_original else None - processed = content - - if self.config.strip_comments: - processed = self._comment_stripper.strip_comments(processed, language) - - if self.config.strip_docstrings: - processed = self._docstring_stripper.strip_docstrings(processed, language) - - # If nothing changed, don't store original - if processed == content: - original = None - - return processed, original - - def _estimate_token_count(self, text: str) -> int: - """Estimate token count based on config. - - If skip_token_count is True, uses character-based estimation (char/4). - Otherwise, uses accurate tiktoken encoding. 
- - Args: - text: Text to count tokens for - - Returns: - Estimated token count - """ - if self.config.skip_token_count: - # Fast character-based estimation: ~4 chars per token - return max(1, len(text) // 4) - return self._tokenizer.count_tokens(text) - - def chunk_by_symbol( - self, - content: str, - symbols: List[Symbol], - file_path: str | Path, - language: str, - symbol_token_counts: Optional[dict[str, int]] = None, - ) -> List[SemanticChunk]: - """Chunk code by extracted symbols (functions, classes). - - Each symbol becomes one chunk with its full content. - Large symbols exceeding max_chunk_size are recursively split using sliding window. - - Args: - content: Source code content - symbols: List of extracted symbols - file_path: Path to source file - language: Programming language - symbol_token_counts: Optional dict mapping symbol names to token counts - """ - chunks: List[SemanticChunk] = [] - lines = content.splitlines(keepends=True) - - for symbol in symbols: - start_line, end_line = symbol.range - # Convert to 0-indexed - start_idx = max(0, start_line - 1) - end_idx = min(len(lines), end_line) - - chunk_content = "".join(lines[start_idx:end_idx]) - if len(chunk_content.strip()) < self.config.min_chunk_size: - continue - - # Check if symbol content exceeds max_chunk_size - if len(chunk_content) > self.config.max_chunk_size: - # Create line mapping for correct line number tracking - line_mapping = list(range(start_line, end_line + 1)) - - # Use sliding window to split large symbol - sub_chunks = self.chunk_sliding_window( - chunk_content, - file_path=file_path, - language=language, - line_mapping=line_mapping - ) - - # Update sub_chunks with parent symbol metadata - for sub_chunk in sub_chunks: - sub_chunk.metadata["symbol_name"] = symbol.name - sub_chunk.metadata["symbol_kind"] = symbol.kind - sub_chunk.metadata["strategy"] = "symbol_split" - sub_chunk.metadata["chunk_type"] = "code" - sub_chunk.metadata["parent_symbol_range"] = (start_line, end_line) - - 
chunks.extend(sub_chunks) - else: - # Process content (strip comments/docstrings if configured) - processed_content, original_content = self._process_content(chunk_content, language) - - # Skip if processed content is too small - if len(processed_content.strip()) < self.config.min_chunk_size: - continue - - # Calculate token count if not provided - token_count = None - if symbol_token_counts and symbol.name in symbol_token_counts: - token_count = symbol_token_counts[symbol.name] - else: - token_count = self._estimate_token_count(processed_content) - - metadata = { - "file": str(file_path), - "language": language, - "symbol_name": symbol.name, - "symbol_kind": symbol.kind, - "start_line": start_line, - "end_line": end_line, - "strategy": "symbol", - "chunk_type": "code", - "token_count": token_count, - } - - # Store original content if it was modified - if original_content is not None: - metadata["original_content"] = original_content - - chunks.append(SemanticChunk( - content=processed_content, - embedding=None, - metadata=metadata - )) - - return chunks - - def chunk_sliding_window( - self, - content: str, - file_path: str | Path, - language: str, - line_mapping: Optional[List[int]] = None, - ) -> List[SemanticChunk]: - """Chunk code using sliding window approach. - - Used for files without clear symbol boundaries or very long functions. - - Args: - content: Source code content - file_path: Path to source file - language: Programming language - line_mapping: Optional list mapping content line indices to original line numbers - (1-indexed). If provided, line_mapping[i] is the original line number - for the i-th line in content. 
- """ - chunks: List[SemanticChunk] = [] - lines = content.splitlines(keepends=True) - - if not lines: - return chunks - - # Calculate lines per chunk based on average line length - avg_line_len = len(content) / max(len(lines), 1) - lines_per_chunk = max(10, int(self.config.max_chunk_size / max(avg_line_len, 1))) - overlap_lines = max(2, int(self.config.overlap / max(avg_line_len, 1))) - # Ensure overlap is less than chunk size to prevent infinite loop - overlap_lines = min(overlap_lines, lines_per_chunk - 1) - - start = 0 - chunk_idx = 0 - - while start < len(lines): - end = min(start + lines_per_chunk, len(lines)) - chunk_content = "".join(lines[start:end]) - - if len(chunk_content.strip()) >= self.config.min_chunk_size: - # Process content (strip comments/docstrings if configured) - processed_content, original_content = self._process_content(chunk_content, language) - - # Skip if processed content is too small - if len(processed_content.strip()) < self.config.min_chunk_size: - # Move window forward - step = lines_per_chunk - overlap_lines - if step <= 0: - step = 1 - start += step - continue - - token_count = self._estimate_token_count(processed_content) - - # Calculate correct line numbers - if line_mapping: - # Use line mapping to get original line numbers - start_line = line_mapping[start] - end_line = line_mapping[end - 1] - else: - # Default behavior: treat content as starting at line 1 - start_line = start + 1 - end_line = end - - metadata = { - "file": str(file_path), - "language": language, - "chunk_index": chunk_idx, - "start_line": start_line, - "end_line": end_line, - "strategy": "sliding_window", - "chunk_type": "code", - "token_count": token_count, - } - - # Store original content if it was modified - if original_content is not None: - metadata["original_content"] = original_content - - chunks.append(SemanticChunk( - content=processed_content, - embedding=None, - metadata=metadata - )) - chunk_idx += 1 - - # Move window, accounting for overlap - 
step = lines_per_chunk - overlap_lines - if step <= 0: - step = 1 # Failsafe to prevent infinite loop - start += step - - # Break if we've reached the end - if end >= len(lines): - break - - return chunks - - def chunk_file( - self, - content: str, - symbols: List[Symbol], - file_path: str | Path, - language: str, - symbol_token_counts: Optional[dict[str, int]] = None, - ) -> List[SemanticChunk]: - """Chunk a file using the best strategy. - - Uses symbol-based chunking if symbols available, - falls back to sliding window for files without symbols. - - Args: - content: Source code content - symbols: List of extracted symbols - file_path: Path to source file - language: Programming language - symbol_token_counts: Optional dict mapping symbol names to token counts - """ - if symbols: - return self.chunk_by_symbol(content, symbols, file_path, language, symbol_token_counts) - return self.chunk_sliding_window(content, file_path, language) - -class DocstringExtractor: - """Extract docstrings from source code.""" - - @staticmethod - def extract_python_docstrings(content: str) -> List[Tuple[str, int, int]]: - """Extract Python docstrings with their line ranges. 
- - Returns: List of (docstring_content, start_line, end_line) tuples - """ - docstrings: List[Tuple[str, int, int]] = [] - lines = content.splitlines(keepends=True) - - i = 0 - while i < len(lines): - line = lines[i] - stripped = line.strip() - if stripped.startswith('"""') or stripped.startswith("'''"): - quote_type = '"""' if stripped.startswith('"""') else "'''" - start_line = i + 1 - - if stripped.count(quote_type) >= 2: - docstring_content = line - end_line = i + 1 - docstrings.append((docstring_content, start_line, end_line)) - i += 1 - continue - - docstring_lines = [line] - i += 1 - while i < len(lines): - docstring_lines.append(lines[i]) - if quote_type in lines[i]: - break - i += 1 - - end_line = i + 1 - docstring_content = "".join(docstring_lines) - docstrings.append((docstring_content, start_line, end_line)) - - i += 1 - - return docstrings - - @staticmethod - def extract_jsdoc_comments(content: str) -> List[Tuple[str, int, int]]: - """Extract JSDoc comments with their line ranges. - - Returns: List of (comment_content, start_line, end_line) tuples - """ - comments: List[Tuple[str, int, int]] = [] - lines = content.splitlines(keepends=True) - - i = 0 - while i < len(lines): - line = lines[i] - stripped = line.strip() - - if stripped.startswith('/**'): - start_line = i + 1 - comment_lines = [line] - i += 1 - - while i < len(lines): - comment_lines.append(lines[i]) - if '*/' in lines[i]: - break - i += 1 - - end_line = i + 1 - comment_content = "".join(comment_lines) - comments.append((comment_content, start_line, end_line)) - - i += 1 - - return comments - - @classmethod - def extract_docstrings( - cls, - content: str, - language: str - ) -> List[Tuple[str, int, int]]: - """Extract docstrings based on language. 
- - Returns: List of (docstring_content, start_line, end_line) tuples - """ - if language == "python": - return cls.extract_python_docstrings(content) - elif language in {"javascript", "typescript"}: - return cls.extract_jsdoc_comments(content) - return [] - - -class HybridChunker: - """Hybrid chunker that prioritizes docstrings before symbol-based chunking. - - Composition-based strategy that: - 1. Extracts docstrings as dedicated chunks - 2. For remaining code, uses base chunker (symbol or sliding window) - """ - - def __init__( - self, - base_chunker: Chunker | None = None, - config: ChunkConfig | None = None - ) -> None: - """Initialize hybrid chunker. - - Args: - base_chunker: Chunker to use for non-docstring content - config: Configuration for chunking - """ - self.config = config or ChunkConfig() - self.base_chunker = base_chunker or Chunker(self.config) - self.docstring_extractor = DocstringExtractor() - - def _get_excluded_line_ranges( - self, - docstrings: List[Tuple[str, int, int]] - ) -> set[int]: - """Get set of line numbers that are part of docstrings.""" - excluded_lines: set[int] = set() - for _, start_line, end_line in docstrings: - for line_num in range(start_line, end_line + 1): - excluded_lines.add(line_num) - return excluded_lines - - def _filter_symbols_outside_docstrings( - self, - symbols: List[Symbol], - excluded_lines: set[int] - ) -> List[Symbol]: - """Filter symbols to exclude those completely within docstrings.""" - filtered: List[Symbol] = [] - for symbol in symbols: - start_line, end_line = symbol.range - symbol_lines = set(range(start_line, end_line + 1)) - if not symbol_lines.issubset(excluded_lines): - filtered.append(symbol) - return filtered - - def _find_parent_symbol( - self, - start_line: int, - end_line: int, - symbols: List[Symbol], - ) -> Optional[Symbol]: - """Find the smallest symbol range that fully contains a docstring span.""" - candidates: List[Symbol] = [] - for symbol in symbols: - sym_start, sym_end = symbol.range 
- if sym_start <= start_line and end_line <= sym_end: - candidates.append(symbol) - if not candidates: - return None - return min(candidates, key=lambda s: (s.range[1] - s.range[0], s.range[0])) - - def chunk_file( - self, - content: str, - symbols: List[Symbol], - file_path: str | Path, - language: str, - symbol_token_counts: Optional[dict[str, int]] = None, - ) -> List[SemanticChunk]: - """Chunk file using hybrid strategy. - - Extracts docstrings first, then chunks remaining code. - - Args: - content: Source code content - symbols: List of extracted symbols - file_path: Path to source file - language: Programming language - symbol_token_counts: Optional dict mapping symbol names to token counts - """ - chunks: List[SemanticChunk] = [] - - # Step 1: Extract docstrings as dedicated chunks - docstrings: List[Tuple[str, int, int]] = [] - if language == "python": - # Fast path: avoid expensive docstring extraction if delimiters are absent. - if '"""' in content or "'''" in content: - docstrings = self.docstring_extractor.extract_docstrings(content, language) - elif language in {"javascript", "typescript"}: - if "/**" in content: - docstrings = self.docstring_extractor.extract_docstrings(content, language) - else: - docstrings = self.docstring_extractor.extract_docstrings(content, language) - - # Fast path: no docstrings -> delegate to base chunker directly. 
- if not docstrings: - if symbols: - base_chunks = self.base_chunker.chunk_by_symbol( - content, symbols, file_path, language, symbol_token_counts - ) - else: - base_chunks = self.base_chunker.chunk_sliding_window(content, file_path, language) - - for chunk in base_chunks: - chunk.metadata["strategy"] = "hybrid" - chunk.metadata["chunk_type"] = "code" - return base_chunks - - for docstring_content, start_line, end_line in docstrings: - if len(docstring_content.strip()) >= self.config.min_chunk_size: - parent_symbol = self._find_parent_symbol(start_line, end_line, symbols) - # Use base chunker's token estimation method - token_count = self.base_chunker._estimate_token_count(docstring_content) - metadata = { - "file": str(file_path), - "language": language, - "chunk_type": "docstring", - "start_line": start_line, - "end_line": end_line, - "strategy": "hybrid", - "token_count": token_count, - } - if parent_symbol is not None: - metadata["parent_symbol"] = parent_symbol.name - metadata["parent_symbol_kind"] = parent_symbol.kind - metadata["parent_symbol_range"] = parent_symbol.range - chunks.append(SemanticChunk( - content=docstring_content, - embedding=None, - metadata=metadata - )) - - # Step 2: Get line ranges occupied by docstrings - excluded_lines = self._get_excluded_line_ranges(docstrings) - - # Step 3: Filter symbols to exclude docstring-only ranges - filtered_symbols = self._filter_symbols_outside_docstrings(symbols, excluded_lines) - - # Step 4: Chunk remaining content using base chunker - if filtered_symbols: - base_chunks = self.base_chunker.chunk_by_symbol( - content, filtered_symbols, file_path, language, symbol_token_counts - ) - for chunk in base_chunks: - chunk.metadata["strategy"] = "hybrid" - chunk.metadata["chunk_type"] = "code" - chunks.append(chunk) - else: - lines = content.splitlines(keepends=True) - remaining_lines: List[str] = [] - - for i, line in enumerate(lines, start=1): - if i not in excluded_lines: - remaining_lines.append(line) - - if 
remaining_lines: - remaining_content = "".join(remaining_lines) - if len(remaining_content.strip()) >= self.config.min_chunk_size: - base_chunks = self.base_chunker.chunk_sliding_window( - remaining_content, file_path, language - ) - for chunk in base_chunks: - chunk.metadata["strategy"] = "hybrid" - chunk.metadata["chunk_type"] = "code" - chunks.append(chunk) - - return chunks diff --git a/codex-lens/build/lib/codexlens/semantic/code_extractor.py b/codex-lens/build/lib/codexlens/semantic/code_extractor.py deleted file mode 100644 index ec5b7211..00000000 --- a/codex-lens/build/lib/codexlens/semantic/code_extractor.py +++ /dev/null @@ -1,274 +0,0 @@ -"""Smart code extraction for complete code blocks.""" - -from __future__ import annotations - -from pathlib import Path -from typing import List, Optional, Tuple - -from codexlens.entities import SearchResult, Symbol - - -def extract_complete_code_block( - result: SearchResult, - source_file_path: Optional[str] = None, - context_lines: int = 0, -) -> str: - """Extract complete code block from a search result. - - Args: - result: SearchResult from semantic search. - source_file_path: Optional path to source file for re-reading. - context_lines: Additional lines of context to include above/below. - - Returns: - Complete code block as string. 
- """ - # If we have full content stored, use it - if result.content: - if context_lines == 0: - return result.content - # Need to add context, read from file - - # Try to read from source file - file_path = source_file_path or result.path - if not file_path or not Path(file_path).exists(): - # Fall back to excerpt - return result.excerpt or "" - - try: - content = Path(file_path).read_text(encoding="utf-8", errors="ignore") - lines = content.splitlines() - - # Get line range - start_line = result.start_line or 1 - end_line = result.end_line or len(lines) - - # Add context - start_idx = max(0, start_line - 1 - context_lines) - end_idx = min(len(lines), end_line + context_lines) - - return "\n".join(lines[start_idx:end_idx]) - except Exception: - return result.excerpt or result.content or "" - - -def extract_symbol_with_context( - file_path: str, - symbol: Symbol, - include_docstring: bool = True, - include_decorators: bool = True, -) -> str: - """Extract a symbol (function/class) with its docstring and decorators. - - Args: - file_path: Path to source file. - symbol: Symbol to extract. - include_docstring: Include docstring if present. - include_decorators: Include decorators/annotations above symbol. - - Returns: - Complete symbol code with context. 
- """ - try: - content = Path(file_path).read_text(encoding="utf-8", errors="ignore") - lines = content.splitlines() - - start_line, end_line = symbol.range - start_idx = start_line - 1 - end_idx = end_line - - # Look for decorators above the symbol - if include_decorators and start_idx > 0: - decorator_start = start_idx - # Search backwards for decorators - i = start_idx - 1 - while i >= 0 and i >= start_idx - 20: # Look up to 20 lines back - line = lines[i].strip() - if line.startswith("@"): - decorator_start = i - i -= 1 - elif line == "" or line.startswith("#"): - # Skip empty lines and comments, continue looking - i -= 1 - elif line.startswith("//") or line.startswith("/*") or line.startswith("*"): - # JavaScript/Java style comments - decorator_start = i - i -= 1 - else: - # Found non-decorator, non-comment line, stop - break - start_idx = decorator_start - - return "\n".join(lines[start_idx:end_idx]) - except Exception: - return "" - - -def format_search_result_code( - result: SearchResult, - max_lines: Optional[int] = None, - show_line_numbers: bool = True, - highlight_match: bool = False, -) -> str: - """Format search result code for display. - - Args: - result: SearchResult to format. - max_lines: Maximum lines to show (None for all). - show_line_numbers: Include line numbers in output. - highlight_match: Add markers for matched region. - - Returns: - Formatted code string. - """ - content = result.content or result.excerpt or "" - if not content: - return "" - - lines = content.splitlines() - - # Truncate if needed - truncated = False - if max_lines and len(lines) > max_lines: - lines = lines[:max_lines] - truncated = True - - # Format with line numbers - if show_line_numbers: - start = result.start_line or 1 - formatted_lines = [] - for i, line in enumerate(lines): - line_num = start + i - formatted_lines.append(f"{line_num:4d} | {line}") - output = "\n".join(formatted_lines) - else: - output = "\n".join(lines) - - if truncated: - output += "\n... 
(truncated)" - - return output - - -def get_code_block_summary(result: SearchResult) -> str: - """Get a concise summary of a code block. - - Args: - result: SearchResult to summarize. - - Returns: - Summary string like "function hello_world (lines 10-25)" - """ - parts = [] - - if result.symbol_kind: - parts.append(result.symbol_kind) - - if result.symbol_name: - parts.append(f"`{result.symbol_name}`") - elif result.excerpt: - # Extract first meaningful identifier - first_line = result.excerpt.split("\n")[0][:50] - parts.append(f'"{first_line}..."') - - if result.start_line and result.end_line: - if result.start_line == result.end_line: - parts.append(f"(line {result.start_line})") - else: - parts.append(f"(lines {result.start_line}-{result.end_line})") - - if result.path: - file_name = Path(result.path).name - parts.append(f"in {file_name}") - - return " ".join(parts) if parts else "unknown code block" - - -class CodeBlockResult: - """Enhanced search result with complete code block.""" - - def __init__(self, result: SearchResult, source_path: Optional[str] = None): - self.result = result - self.source_path = source_path or result.path - self._full_code: Optional[str] = None - - @property - def score(self) -> float: - return self.result.score - - @property - def path(self) -> str: - return self.result.path - - @property - def file_name(self) -> str: - return Path(self.result.path).name - - @property - def symbol_name(self) -> Optional[str]: - return self.result.symbol_name - - @property - def symbol_kind(self) -> Optional[str]: - return self.result.symbol_kind - - @property - def line_range(self) -> Tuple[int, int]: - return ( - self.result.start_line or 1, - self.result.end_line or 1 - ) - - @property - def full_code(self) -> str: - """Get full code block content.""" - if self._full_code is None: - self._full_code = extract_complete_code_block(self.result, self.source_path) - return self._full_code - - @property - def excerpt(self) -> str: - """Get short 
excerpt.""" - return self.result.excerpt or "" - - @property - def summary(self) -> str: - """Get code block summary.""" - return get_code_block_summary(self.result) - - def format( - self, - max_lines: Optional[int] = None, - show_line_numbers: bool = True, - ) -> str: - """Format code for display.""" - # Use full code if available - display_result = SearchResult( - path=self.result.path, - score=self.result.score, - content=self.full_code, - start_line=self.result.start_line, - end_line=self.result.end_line, - ) - return format_search_result_code( - display_result, - max_lines=max_lines, - show_line_numbers=show_line_numbers - ) - - def __repr__(self) -> str: - return f"" - - -def enhance_search_results( - results: List[SearchResult], -) -> List[CodeBlockResult]: - """Enhance search results with complete code block access. - - Args: - results: List of SearchResult from semantic search. - - Returns: - List of CodeBlockResult with full code access. - """ - return [CodeBlockResult(r) for r in results] diff --git a/codex-lens/build/lib/codexlens/semantic/embedder.py b/codex-lens/build/lib/codexlens/semantic/embedder.py deleted file mode 100644 index e2d21717..00000000 --- a/codex-lens/build/lib/codexlens/semantic/embedder.py +++ /dev/null @@ -1,288 +0,0 @@ -"""Embedder for semantic code search using fastembed. - -Supports GPU acceleration via ONNX execution providers (CUDA, TensorRT, DirectML, ROCm, CoreML). -GPU acceleration is automatic when available, with transparent CPU fallback. -""" - -from __future__ import annotations - -import gc -import logging -import threading -from typing import Dict, Iterable, List, Optional - -import numpy as np - -from . 
import SEMANTIC_AVAILABLE -from .base import BaseEmbedder -from .gpu_support import get_optimal_providers, is_gpu_available, get_gpu_summary, get_selected_device_id - -logger = logging.getLogger(__name__) - -# Global embedder cache for singleton pattern -_embedder_cache: Dict[str, "Embedder"] = {} -_cache_lock = threading.RLock() - - -def get_embedder(profile: str = "code", use_gpu: bool = True) -> "Embedder": - """Get or create a cached Embedder instance (thread-safe singleton). - - This function provides significant performance improvement by reusing - Embedder instances across multiple searches, avoiding repeated model - loading overhead (~0.8s per load). - - Args: - profile: Model profile ("fast", "code", "multilingual", "balanced") - use_gpu: If True, use GPU acceleration when available (default: True) - - Returns: - Cached Embedder instance for the given profile - """ - global _embedder_cache - - # Cache key includes GPU preference to support mixed configurations - cache_key = f"{profile}:{'gpu' if use_gpu else 'cpu'}" - - # All cache access is protected by _cache_lock to avoid races with - # clear_embedder_cache() during concurrent access. - with _cache_lock: - embedder = _embedder_cache.get(cache_key) - if embedder is not None: - return embedder - - # Create new embedder and cache it - embedder = Embedder(profile=profile, use_gpu=use_gpu) - # Pre-load model to ensure it's ready - embedder._load_model() - _embedder_cache[cache_key] = embedder - - # Log GPU status on first embedder creation - if use_gpu and is_gpu_available(): - logger.info(f"Embedder initialized with GPU: {get_gpu_summary()}") - elif use_gpu: - logger.debug("GPU not available, using CPU for embeddings") - - return embedder - - -def clear_embedder_cache() -> None: - """Clear the embedder cache and release ONNX resources. - - This method ensures proper cleanup of ONNX model resources to prevent - memory leaks when embedders are no longer needed. 
- """ - global _embedder_cache - with _cache_lock: - # Release ONNX resources before clearing cache - for embedder in _embedder_cache.values(): - if embedder._model is not None: - del embedder._model - embedder._model = None - _embedder_cache.clear() - gc.collect() - - -class Embedder(BaseEmbedder): - """Generate embeddings for code chunks using fastembed (ONNX-based). - - Supported Model Profiles: - - fast: BAAI/bge-small-en-v1.5 (384 dim) - Fast, lightweight, English-optimized - - code: jinaai/jina-embeddings-v2-base-code (768 dim) - Code-optimized, best for programming languages - - multilingual: intfloat/multilingual-e5-large (1024 dim) - Multilingual + code support - - balanced: mixedbread-ai/mxbai-embed-large-v1 (1024 dim) - High accuracy, general purpose - """ - - # Model profiles for different use cases - MODELS = { - "fast": "BAAI/bge-small-en-v1.5", # 384 dim - Fast, lightweight - "code": "jinaai/jina-embeddings-v2-base-code", # 768 dim - Code-optimized - "multilingual": "intfloat/multilingual-e5-large", # 1024 dim - Multilingual - "balanced": "mixedbread-ai/mxbai-embed-large-v1", # 1024 dim - High accuracy - } - - # Dimension mapping for each model - MODEL_DIMS = { - "BAAI/bge-small-en-v1.5": 384, - "jinaai/jina-embeddings-v2-base-code": 768, - "intfloat/multilingual-e5-large": 1024, - "mixedbread-ai/mxbai-embed-large-v1": 1024, - } - - # Default model (fast profile) - DEFAULT_MODEL = "BAAI/bge-small-en-v1.5" - DEFAULT_PROFILE = "fast" - - def __init__( - self, - model_name: str | None = None, - profile: str | None = None, - use_gpu: bool = True, - providers: List[str] | None = None, - ) -> None: - """Initialize embedder with model or profile. - - Args: - model_name: Explicit model name (e.g., "jinaai/jina-embeddings-v2-base-code") - profile: Model profile shortcut ("fast", "code", "multilingual", "balanced") - If both provided, model_name takes precedence. 
- use_gpu: If True, use GPU acceleration when available (default: True) - providers: Explicit ONNX providers list (overrides use_gpu if provided) - """ - if not SEMANTIC_AVAILABLE: - raise ImportError( - "Semantic search dependencies not available. " - "Install with: pip install codexlens[semantic]" - ) - - # Resolve model name from profile or use explicit name - if model_name: - self._model_name = model_name - elif profile and profile in self.MODELS: - self._model_name = self.MODELS[profile] - else: - self._model_name = self.DEFAULT_MODEL - - # Configure ONNX execution providers with device_id options for GPU selection - # Using with_device_options=True ensures DirectML/CUDA device_id is passed correctly - if providers is not None: - self._providers = providers - else: - self._providers = get_optimal_providers(use_gpu=use_gpu, with_device_options=True) - - self._use_gpu = use_gpu - self._model = None - - @property - def model_name(self) -> str: - """Get model name.""" - return self._model_name - - @property - def embedding_dim(self) -> int: - """Get embedding dimension for current model.""" - return self.MODEL_DIMS.get(self._model_name, 768) # Default to 768 if unknown - - @property - def max_tokens(self) -> int: - """Get maximum token limit for current model. - - Returns: - int: Maximum number of tokens based on model profile. 
- - fast: 512 (lightweight, optimized for speed) - - code: 8192 (code-optimized, larger context) - - multilingual: 512 (standard multilingual model) - - balanced: 512 (general purpose) - """ - # Determine profile from model name - profile = None - for prof, model in self.MODELS.items(): - if model == self._model_name: - profile = prof - break - - # Return token limit based on profile - if profile == "code": - return 8192 - elif profile in ("fast", "multilingual", "balanced"): - return 512 - else: - # Default for unknown models - return 512 - - @property - def providers(self) -> List[str]: - """Get configured ONNX execution providers.""" - return self._providers - - @property - def is_gpu_enabled(self) -> bool: - """Check if GPU acceleration is enabled for this embedder.""" - gpu_providers = {"CUDAExecutionProvider", "TensorrtExecutionProvider", - "DmlExecutionProvider", "ROCMExecutionProvider", "CoreMLExecutionProvider"} - # Handle both string providers and tuple providers (name, options) - for p in self._providers: - provider_name = p[0] if isinstance(p, tuple) else p - if provider_name in gpu_providers: - return True - return False - - def _load_model(self) -> None: - """Lazy load the embedding model with configured providers.""" - if self._model is not None: - return - - from fastembed import TextEmbedding - - # providers already include device_id options via get_optimal_providers(with_device_options=True) - # DO NOT pass device_ids separately - fastembed ignores it when providers is specified - # See: fastembed/text/onnx_embedding.py - device_ids is only used with cuda=True - try: - self._model = TextEmbedding( - model_name=self.model_name, - providers=self._providers, - ) - logger.debug(f"Model loaded with providers: {self._providers}") - except TypeError: - # Fallback for older fastembed versions without providers parameter - logger.warning( - "fastembed version doesn't support 'providers' parameter. 
" - "Upgrade fastembed for GPU acceleration: pip install --upgrade fastembed" - ) - self._model = TextEmbedding(model_name=self.model_name) - - def embed(self, texts: str | Iterable[str]) -> List[List[float]]: - """Generate embeddings for one or more texts. - - Args: - texts: Single text or iterable of texts to embed. - - Returns: - List of embedding vectors (each is a list of floats). - - Note: - This method converts numpy arrays to Python lists for backward compatibility. - For memory-efficient processing, use embed_to_numpy() instead. - """ - self._load_model() - - if isinstance(texts, str): - texts = [texts] - else: - texts = list(texts) - - embeddings = list(self._model.embed(texts)) - return [emb.tolist() for emb in embeddings] - - def embed_to_numpy(self, texts: str | Iterable[str], batch_size: Optional[int] = None) -> np.ndarray: - """Generate embeddings for one or more texts (returns numpy arrays). - - This method is more memory-efficient than embed() as it avoids converting - numpy arrays to Python lists, which can significantly reduce memory usage - during batch processing. - - Args: - texts: Single text or iterable of texts to embed. - batch_size: Optional batch size for fastembed processing. - Larger values improve GPU utilization but use more memory. - - Returns: - numpy.ndarray of shape (n_texts, embedding_dim) containing embeddings. 
- """ - self._load_model() - - if isinstance(texts, str): - texts = [texts] - else: - texts = list(texts) - - # Pass batch_size to fastembed for optimal GPU utilization - # Default batch_size in fastembed is 256, but larger values can improve throughput - if batch_size is not None: - embeddings = list(self._model.embed(texts, batch_size=batch_size)) - else: - embeddings = list(self._model.embed(texts)) - return np.array(embeddings) - - def embed_single(self, text: str) -> List[float]: - """Generate embedding for a single text.""" - return self.embed(text)[0] diff --git a/codex-lens/build/lib/codexlens/semantic/factory.py b/codex-lens/build/lib/codexlens/semantic/factory.py deleted file mode 100644 index 3295eba8..00000000 --- a/codex-lens/build/lib/codexlens/semantic/factory.py +++ /dev/null @@ -1,158 +0,0 @@ -"""Factory for creating embedders. - -Provides a unified interface for instantiating different embedder backends. -Includes caching to avoid repeated model loading overhead. -""" - -from __future__ import annotations - -import logging -import threading -from typing import Any, Dict, List, Optional - -from .base import BaseEmbedder - -# Module-level cache for embedder instances -# Key: (backend, profile, model, use_gpu) -> embedder instance -_embedder_cache: Dict[tuple, BaseEmbedder] = {} -_cache_lock = threading.Lock() -_logger = logging.getLogger(__name__) - - -def get_embedder( - backend: str = "fastembed", - profile: str = "code", - model: str = "default", - use_gpu: bool = True, - endpoints: Optional[List[Dict[str, Any]]] = None, - strategy: str = "latency_aware", - cooldown: float = 60.0, - **kwargs: Any, -) -> BaseEmbedder: - """Factory function to create embedder based on backend. - - Args: - backend: Embedder backend to use. 
Options: - - "fastembed": Use fastembed (ONNX-based) embedder (default) - - "litellm": Use ccw-litellm embedder - profile: Model profile for fastembed backend ("fast", "code", "multilingual", "balanced") - Used only when backend="fastembed". Default: "code" - model: Model identifier for litellm backend. - Used only when backend="litellm". Default: "default" - use_gpu: Whether to use GPU acceleration when available (default: True). - Used only when backend="fastembed". - endpoints: Optional list of endpoint configurations for multi-endpoint load balancing. - Each endpoint is a dict with keys: model, api_key, api_base, weight. - Used only when backend="litellm" and multiple endpoints provided. - strategy: Selection strategy for multi-endpoint mode: - "round_robin", "latency_aware", "weighted_random". - Default: "latency_aware" - cooldown: Default cooldown seconds for rate-limited endpoints (default: 60.0) - **kwargs: Additional backend-specific arguments - - Returns: - BaseEmbedder: Configured embedder instance - - Raises: - ValueError: If backend is not recognized - ImportError: If required backend dependencies are not installed - - Examples: - Create fastembed embedder with code profile: - >>> embedder = get_embedder(backend="fastembed", profile="code") - - Create fastembed embedder with fast profile and CPU only: - >>> embedder = get_embedder(backend="fastembed", profile="fast", use_gpu=False) - - Create litellm embedder: - >>> embedder = get_embedder(backend="litellm", model="text-embedding-3-small") - - Create rotational embedder with multiple endpoints: - >>> endpoints = [ - ... {"model": "openai/text-embedding-3-small", "api_key": "sk-..."}, - ... {"model": "azure/my-embedding", "api_base": "https://...", "api_key": "..."}, - ... 
] - >>> embedder = get_embedder(backend="litellm", endpoints=endpoints) - """ - # Build cache key from immutable configuration - if backend == "fastembed": - cache_key = ("fastembed", profile, None, use_gpu) - elif backend == "litellm": - # For litellm, use model as part of cache key - # Multi-endpoint mode is not cached as it's more complex - if endpoints and len(endpoints) > 1: - cache_key = None # Skip cache for multi-endpoint - else: - effective_model = endpoints[0]["model"] if endpoints else model - cache_key = ("litellm", None, effective_model, None) - else: - cache_key = None - - # Check cache first (thread-safe) - if cache_key is not None: - with _cache_lock: - if cache_key in _embedder_cache: - _logger.debug("Returning cached embedder for %s", cache_key) - return _embedder_cache[cache_key] - - # Create new embedder instance - embedder: Optional[BaseEmbedder] = None - - if backend == "fastembed": - from .embedder import Embedder - embedder = Embedder(profile=profile, use_gpu=use_gpu, **kwargs) - elif backend == "litellm": - # Check if multi-endpoint mode is requested - if endpoints and len(endpoints) > 1: - from .rotational_embedder import create_rotational_embedder - # Multi-endpoint is not cached - return create_rotational_embedder( - endpoints_config=endpoints, - strategy=strategy, - default_cooldown=cooldown, - ) - elif endpoints and len(endpoints) == 1: - # Single endpoint in list - use it directly - ep = endpoints[0] - ep_kwargs = {**kwargs} - if "api_key" in ep: - ep_kwargs["api_key"] = ep["api_key"] - if "api_base" in ep: - ep_kwargs["api_base"] = ep["api_base"] - from .litellm_embedder import LiteLLMEmbedderWrapper - embedder = LiteLLMEmbedderWrapper(model=ep["model"], **ep_kwargs) - else: - # No endpoints list - use model parameter - from .litellm_embedder import LiteLLMEmbedderWrapper - embedder = LiteLLMEmbedderWrapper(model=model, **kwargs) - else: - raise ValueError( - f"Unknown backend: {backend}. 
" - f"Supported backends: 'fastembed', 'litellm'" - ) - - # Cache the embedder for future use (thread-safe) - if cache_key is not None and embedder is not None: - with _cache_lock: - # Double-check to avoid race condition - if cache_key not in _embedder_cache: - _embedder_cache[cache_key] = embedder - _logger.debug("Cached new embedder for %s", cache_key) - else: - # Another thread created it already, use that one - embedder = _embedder_cache[cache_key] - - return embedder # type: ignore - - -def clear_embedder_cache() -> int: - """Clear the embedder cache. - - Returns: - Number of embedders cleared from cache - """ - with _cache_lock: - count = len(_embedder_cache) - _embedder_cache.clear() - _logger.debug("Cleared %d embedders from cache", count) - return count diff --git a/codex-lens/build/lib/codexlens/semantic/gpu_support.py b/codex-lens/build/lib/codexlens/semantic/gpu_support.py deleted file mode 100644 index 62a5186d..00000000 --- a/codex-lens/build/lib/codexlens/semantic/gpu_support.py +++ /dev/null @@ -1,431 +0,0 @@ -"""GPU acceleration support for semantic embeddings. - -This module provides GPU detection, initialization, and fallback handling -for ONNX-based embedding generation. 
-""" - -from __future__ import annotations - -import logging -from dataclasses import dataclass -from typing import List, Optional - -logger = logging.getLogger(__name__) - - -@dataclass -class GPUDevice: - """Individual GPU device info.""" - device_id: int - name: str - is_discrete: bool # True for discrete GPU (NVIDIA, AMD), False for integrated (Intel UHD) - vendor: str # "nvidia", "amd", "intel", "unknown" - - -@dataclass -class GPUInfo: - """GPU availability and configuration info.""" - - gpu_available: bool = False - cuda_available: bool = False - gpu_count: int = 0 - gpu_name: Optional[str] = None - onnx_providers: List[str] = None - devices: List[GPUDevice] = None # List of detected GPU devices - preferred_device_id: Optional[int] = None # Preferred GPU for embedding - - def __post_init__(self): - if self.onnx_providers is None: - self.onnx_providers = ["CPUExecutionProvider"] - if self.devices is None: - self.devices = [] - - -_gpu_info_cache: Optional[GPUInfo] = None - - -def _enumerate_gpus() -> List[GPUDevice]: - """Enumerate available GPU devices using WMI on Windows. - - Returns: - List of GPUDevice with device info, ordered by device_id. 
- """ - devices = [] - - try: - import subprocess - import sys - - if sys.platform == "win32": - # Use PowerShell to query GPU information via WMI - cmd = [ - "powershell", "-NoProfile", "-Command", - "Get-WmiObject Win32_VideoController | Select-Object DeviceID, Name, AdapterCompatibility | ConvertTo-Json" - ] - result = subprocess.run(cmd, capture_output=True, text=True, timeout=10) - - if result.returncode == 0 and result.stdout.strip(): - import json - gpu_data = json.loads(result.stdout) - - # Handle single GPU case (returns dict instead of list) - if isinstance(gpu_data, dict): - gpu_data = [gpu_data] - - for idx, gpu in enumerate(gpu_data): - name = gpu.get("Name", "Unknown GPU") - compat = gpu.get("AdapterCompatibility", "").lower() - - # Determine vendor - name_lower = name.lower() - if "nvidia" in name_lower or "nvidia" in compat: - vendor = "nvidia" - is_discrete = True - elif "amd" in name_lower or "radeon" in name_lower or "amd" in compat: - vendor = "amd" - is_discrete = True - elif "intel" in name_lower or "intel" in compat: - vendor = "intel" - # Intel UHD/Iris are integrated, Intel Arc is discrete - is_discrete = "arc" in name_lower - else: - vendor = "unknown" - is_discrete = False - - devices.append(GPUDevice( - device_id=idx, - name=name, - is_discrete=is_discrete, - vendor=vendor - )) - logger.debug(f"Detected GPU {idx}: {name} (vendor={vendor}, discrete={is_discrete})") - - except Exception as e: - logger.debug(f"GPU enumeration failed: {e}") - - return devices - - -def _get_preferred_device_id(devices: List[GPUDevice]) -> Optional[int]: - """Determine the preferred GPU device_id for embedding. - - Preference order: - 1. NVIDIA discrete GPU (best DirectML/CUDA support) - 2. AMD discrete GPU - 3. Intel Arc (discrete) - 4. Intel integrated (fallback) - - Returns: - device_id of preferred GPU, or None to use default. 
- """ - if not devices: - return None - - # Priority: NVIDIA > AMD > Intel Arc > Intel integrated - priority_order = [ - ("nvidia", True), # NVIDIA discrete - ("amd", True), # AMD discrete - ("intel", True), # Intel Arc (discrete) - ("intel", False), # Intel integrated (fallback) - ] - - for target_vendor, target_discrete in priority_order: - for device in devices: - if device.vendor == target_vendor and device.is_discrete == target_discrete: - logger.info(f"Preferred GPU: {device.name} (device_id={device.device_id})") - return device.device_id - - # If no match, use first device - if devices: - return devices[0].device_id - - return None - - -def detect_gpu(force_refresh: bool = False) -> GPUInfo: - """Detect available GPU resources for embedding acceleration. - - Args: - force_refresh: If True, re-detect GPU even if cached. - - Returns: - GPUInfo with detection results. - """ - global _gpu_info_cache - - if _gpu_info_cache is not None and not force_refresh: - return _gpu_info_cache - - info = GPUInfo() - - # Enumerate GPU devices first - info.devices = _enumerate_gpus() - info.gpu_count = len(info.devices) - if info.devices: - # Set preferred device (discrete GPU preferred over integrated) - info.preferred_device_id = _get_preferred_device_id(info.devices) - # Set gpu_name to preferred device name - for dev in info.devices: - if dev.device_id == info.preferred_device_id: - info.gpu_name = dev.name - break - - # Check PyTorch CUDA availability (most reliable detection) - try: - import torch - if torch.cuda.is_available(): - info.cuda_available = True - info.gpu_available = True - info.gpu_count = torch.cuda.device_count() - if info.gpu_count > 0: - info.gpu_name = torch.cuda.get_device_name(0) - logger.debug(f"PyTorch CUDA detected: {info.gpu_count} GPU(s)") - except ImportError: - logger.debug("PyTorch not available for GPU detection") - - # Check ONNX Runtime providers with validation - try: - import onnxruntime as ort - available_providers = 
ort.get_available_providers() - - # Build provider list with priority order - providers = [] - - # Test each provider to ensure it actually works - def test_provider(provider_name: str) -> bool: - """Test if a provider actually works by creating a dummy session.""" - try: - # Create a minimal ONNX model to test provider - import numpy as np - # Simple test: just check if provider can be instantiated - sess_options = ort.SessionOptions() - sess_options.log_severity_level = 4 # Suppress warnings - return True - except Exception: - return False - - # CUDA provider (NVIDIA GPU) - check if CUDA runtime is available - if "CUDAExecutionProvider" in available_providers: - # Verify CUDA is actually usable by checking for cuBLAS - cuda_works = False - try: - import ctypes - # Try to load cuBLAS to verify CUDA installation - try: - ctypes.CDLL("cublas64_12.dll") - cuda_works = True - except OSError: - try: - ctypes.CDLL("cublas64_11.dll") - cuda_works = True - except OSError: - pass - except Exception: - pass - - if cuda_works: - providers.append("CUDAExecutionProvider") - info.gpu_available = True - logger.debug("ONNX CUDAExecutionProvider available and working") - else: - logger.debug("ONNX CUDAExecutionProvider listed but CUDA runtime not found") - - # TensorRT provider (optimized NVIDIA inference) - if "TensorrtExecutionProvider" in available_providers: - # TensorRT requires additional libraries, skip for now - logger.debug("ONNX TensorrtExecutionProvider available (requires TensorRT SDK)") - - # DirectML provider (Windows GPU - AMD/Intel/NVIDIA) - if "DmlExecutionProvider" in available_providers: - providers.append("DmlExecutionProvider") - info.gpu_available = True - logger.debug("ONNX DmlExecutionProvider available (DirectML)") - - # ROCm provider (AMD GPU on Linux) - if "ROCMExecutionProvider" in available_providers: - providers.append("ROCMExecutionProvider") - info.gpu_available = True - logger.debug("ONNX ROCMExecutionProvider available (AMD)") - - # CoreML 
provider (Apple Silicon) - if "CoreMLExecutionProvider" in available_providers: - providers.append("CoreMLExecutionProvider") - info.gpu_available = True - logger.debug("ONNX CoreMLExecutionProvider available (Apple)") - - # Always include CPU as fallback - providers.append("CPUExecutionProvider") - - info.onnx_providers = providers - - except ImportError: - logger.debug("ONNX Runtime not available") - info.onnx_providers = ["CPUExecutionProvider"] - - _gpu_info_cache = info - return info - - -def get_optimal_providers(use_gpu: bool = True, with_device_options: bool = False) -> list: - """Get optimal ONNX execution providers based on availability. - - Args: - use_gpu: If True, include GPU providers when available. - If False, force CPU-only execution. - with_device_options: If True, return providers as tuples with device_id options - for proper GPU device selection (required for DirectML). - - Returns: - List of provider names or tuples (provider_name, options_dict) in priority order. - """ - if not use_gpu: - return ["CPUExecutionProvider"] - - gpu_info = detect_gpu() - - # Check if GPU was requested but not available - log warning - if not gpu_info.gpu_available: - try: - import onnxruntime as ort - available_providers = ort.get_available_providers() - except ImportError: - available_providers = [] - logger.warning( - "GPU acceleration was requested, but no supported GPU provider (CUDA, DirectML) " - f"was found. Available providers: {available_providers}. Falling back to CPU." 
- ) - else: - # Log which GPU provider is being used - gpu_providers = [p for p in gpu_info.onnx_providers if p != "CPUExecutionProvider"] - if gpu_providers: - logger.info(f"Using {gpu_providers[0]} for ONNX GPU acceleration") - - if not with_device_options: - return gpu_info.onnx_providers - - # Build providers with device_id options for GPU providers - device_id = get_selected_device_id() - providers = [] - - for provider in gpu_info.onnx_providers: - if provider == "DmlExecutionProvider" and device_id is not None: - # DirectML requires device_id in provider_options tuple - providers.append(("DmlExecutionProvider", {"device_id": device_id})) - logger.debug(f"DmlExecutionProvider configured with device_id={device_id}") - elif provider == "CUDAExecutionProvider" and device_id is not None: - # CUDA also supports device_id in provider_options - providers.append(("CUDAExecutionProvider", {"device_id": device_id})) - logger.debug(f"CUDAExecutionProvider configured with device_id={device_id}") - elif provider == "ROCMExecutionProvider" and device_id is not None: - # ROCm supports device_id - providers.append(("ROCMExecutionProvider", {"device_id": device_id})) - logger.debug(f"ROCMExecutionProvider configured with device_id={device_id}") - else: - # CPU and other providers don't need device_id - providers.append(provider) - - return providers - - -def is_gpu_available() -> bool: - """Check if any GPU acceleration is available.""" - return detect_gpu().gpu_available - - -def get_gpu_summary() -> str: - """Get human-readable GPU status summary.""" - info = detect_gpu() - - if not info.gpu_available: - return "GPU: Not available (using CPU)" - - parts = [] - if info.gpu_name: - parts.append(f"GPU: {info.gpu_name}") - if info.gpu_count > 1: - parts.append(f"({info.gpu_count} devices)") - - # Show active providers (excluding CPU fallback) - gpu_providers = [p for p in info.onnx_providers if p != "CPUExecutionProvider"] - if gpu_providers: - parts.append(f"Providers: {', 
'.join(gpu_providers)}") - - return " | ".join(parts) if parts else "GPU: Available" - - -def clear_gpu_cache() -> None: - """Clear cached GPU detection info.""" - global _gpu_info_cache - _gpu_info_cache = None - - -# User-selected device ID (overrides auto-detection) -_selected_device_id: Optional[int] = None - - -def get_gpu_devices() -> List[dict]: - """Get list of available GPU devices for frontend selection. - - Returns: - List of dicts with device info for each GPU. - """ - info = detect_gpu() - devices = [] - - for dev in info.devices: - devices.append({ - "device_id": dev.device_id, - "name": dev.name, - "vendor": dev.vendor, - "is_discrete": dev.is_discrete, - "is_preferred": dev.device_id == info.preferred_device_id, - "is_selected": dev.device_id == get_selected_device_id(), - }) - - return devices - - -def get_selected_device_id() -> Optional[int]: - """Get the user-selected GPU device_id. - - Returns: - User-selected device_id, or auto-detected preferred device_id if not set. - """ - global _selected_device_id - - if _selected_device_id is not None: - return _selected_device_id - - # Fall back to auto-detected preferred device - info = detect_gpu() - return info.preferred_device_id - - -def set_selected_device_id(device_id: Optional[int]) -> bool: - """Set the GPU device_id to use for embeddings. - - Args: - device_id: GPU device_id to use, or None to use auto-detection. - - Returns: - True if device_id is valid, False otherwise. 
- """ - global _selected_device_id - - if device_id is None: - _selected_device_id = None - logger.info("GPU selection reset to auto-detection") - return True - - # Validate device_id exists - info = detect_gpu() - valid_ids = [dev.device_id for dev in info.devices] - - if device_id in valid_ids: - _selected_device_id = device_id - device_name = next((dev.name for dev in info.devices if dev.device_id == device_id), "Unknown") - logger.info(f"GPU selection set to device {device_id}: {device_name}") - return True - else: - logger.warning(f"Invalid device_id {device_id}. Valid IDs: {valid_ids}") - return False diff --git a/codex-lens/build/lib/codexlens/semantic/litellm_embedder.py b/codex-lens/build/lib/codexlens/semantic/litellm_embedder.py deleted file mode 100644 index ee4284dd..00000000 --- a/codex-lens/build/lib/codexlens/semantic/litellm_embedder.py +++ /dev/null @@ -1,144 +0,0 @@ -"""LiteLLM embedder wrapper for CodexLens. - -Provides integration with ccw-litellm's LiteLLMEmbedder for embedding generation. -""" - -from __future__ import annotations - -from typing import Iterable - -import numpy as np - -from .base import BaseEmbedder - - -class LiteLLMEmbedderWrapper(BaseEmbedder): - """Wrapper for ccw-litellm LiteLLMEmbedder. - - This wrapper adapts the ccw-litellm LiteLLMEmbedder to the CodexLens - BaseEmbedder interface, enabling seamless integration with CodexLens - semantic search functionality. - - Args: - model: Model identifier for LiteLLM (default: "default") - **kwargs: Additional arguments passed to LiteLLMEmbedder - - Raises: - ImportError: If ccw-litellm package is not installed - """ - - def __init__(self, model: str = "default", **kwargs) -> None: - """Initialize LiteLLM embedder wrapper. 
- - Args: - model: Model identifier for LiteLLM (default: "default") - **kwargs: Additional arguments passed to LiteLLMEmbedder - - Raises: - ImportError: If ccw-litellm package is not installed - """ - try: - from ccw_litellm import LiteLLMEmbedder - self._embedder = LiteLLMEmbedder(model=model, **kwargs) - except ImportError as e: - raise ImportError( - "ccw-litellm not installed. Install with: pip install ccw-litellm" - ) from e - - @property - def embedding_dim(self) -> int: - """Return embedding dimensions from LiteLLMEmbedder. - - Returns: - int: Dimension of the embedding vectors. - """ - return self._embedder.dimensions - - @property - def model_name(self) -> str: - """Return model name from LiteLLMEmbedder. - - Returns: - str: Name or identifier of the underlying model. - """ - return self._embedder.model_name - - @property - def max_tokens(self) -> int: - """Return maximum token limit for the embedding model. - - Returns: - int: Maximum number of tokens that can be embedded at once. - Reads from LiteLLM config's max_input_tokens property. - """ - # Get from LiteLLM embedder's max_input_tokens property (now exposed) - if hasattr(self._embedder, 'max_input_tokens'): - return self._embedder.max_input_tokens - - # Fallback: infer from model name - model_name_lower = self.model_name.lower() - - # Large models (8B or "large" in name) - if '8b' in model_name_lower or 'large' in model_name_lower: - return 32768 - - # OpenAI text-embedding-3-* models - if 'text-embedding-3' in model_name_lower: - return 8191 - - # Default fallback - return 8192 - - def _sanitize_text(self, text: str) -> str: - """Sanitize text to work around ModelScope API routing bug. - - ModelScope incorrectly routes text starting with lowercase 'import' - to an Ollama endpoint, causing failures. This adds a leading space - to work around the issue without affecting embedding quality. - - Args: - text: Text to sanitize. - - Returns: - Sanitized text safe for embedding API. 
- """ - if text.startswith('import'): - return ' ' + text - return text - - def embed_to_numpy(self, texts: str | Iterable[str], **kwargs) -> np.ndarray: - """Embed texts to numpy array using LiteLLMEmbedder. - - Args: - texts: Single text or iterable of texts to embed. - **kwargs: Additional arguments (ignored for LiteLLM backend). - Accepts batch_size for API compatibility with fastembed. - - Returns: - numpy.ndarray: Array of shape (n_texts, embedding_dim) containing embeddings. - """ - if isinstance(texts, str): - texts = [texts] - else: - texts = list(texts) - - # Sanitize texts to avoid ModelScope routing bug - texts = [self._sanitize_text(t) for t in texts] - - # LiteLLM handles batching internally, ignore batch_size parameter - return self._embedder.embed(texts) - - def embed_single(self, text: str) -> list[float]: - """Generate embedding for a single text. - - Args: - text: Text to embed. - - Returns: - list[float]: Embedding vector as a list of floats. - """ - # Sanitize text before embedding - sanitized = self._sanitize_text(text) - embedding = self._embedder.embed([sanitized]) - return embedding[0].tolist() - diff --git a/codex-lens/build/lib/codexlens/semantic/reranker/__init__.py b/codex-lens/build/lib/codexlens/semantic/reranker/__init__.py deleted file mode 100644 index e52b0223..00000000 --- a/codex-lens/build/lib/codexlens/semantic/reranker/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -"""Reranker backends for second-stage search ranking. - -This subpackage provides a unified interface and factory for different reranking -implementations (e.g., ONNX, API-based, LiteLLM, and legacy sentence-transformers). 
-""" - -from __future__ import annotations - -from .base import BaseReranker -from .factory import check_reranker_available, get_reranker -from .fastembed_reranker import FastEmbedReranker, check_fastembed_reranker_available -from .legacy import CrossEncoderReranker, check_cross_encoder_available -from .onnx_reranker import ONNXReranker, check_onnx_reranker_available - -__all__ = [ - "BaseReranker", - "check_reranker_available", - "get_reranker", - "CrossEncoderReranker", - "check_cross_encoder_available", - "FastEmbedReranker", - "check_fastembed_reranker_available", - "ONNXReranker", - "check_onnx_reranker_available", -] diff --git a/codex-lens/build/lib/codexlens/semantic/reranker/api_reranker.py b/codex-lens/build/lib/codexlens/semantic/reranker/api_reranker.py deleted file mode 100644 index 0c255047..00000000 --- a/codex-lens/build/lib/codexlens/semantic/reranker/api_reranker.py +++ /dev/null @@ -1,403 +0,0 @@ -"""API-based reranker using a remote HTTP provider. - -Supported providers: -- SiliconFlow: https://api.siliconflow.cn/v1/rerank -- Cohere: https://api.cohere.ai/v1/rerank -- Jina: https://api.jina.ai/v1/rerank -""" - -from __future__ import annotations - -import logging -import os -import random -import time -from pathlib import Path -from typing import Any, Mapping, Sequence - -from .base import BaseReranker - -logger = logging.getLogger(__name__) - -_DEFAULT_ENV_API_KEY = "RERANKER_API_KEY" - - -def _get_env_with_fallback(key: str, workspace_root: Path | None = None) -> str | None: - """Get environment variable with .env file fallback.""" - # Check os.environ first - if key in os.environ: - return os.environ[key] - - # Try loading from .env files - try: - from codexlens.env_config import get_env - return get_env(key, workspace_root=workspace_root) - except ImportError: - return None - - -def check_httpx_available() -> tuple[bool, str | None]: - try: - import httpx # noqa: F401 - except ImportError as exc: # pragma: no cover - optional dependency - 
return False, f"httpx not available: {exc}. Install with: pip install httpx" - return True, None - - -class APIReranker(BaseReranker): - """Reranker backed by a remote reranking HTTP API.""" - - _PROVIDER_DEFAULTS: Mapping[str, Mapping[str, str]] = { - "siliconflow": { - "api_base": "https://api.siliconflow.cn", - "endpoint": "/v1/rerank", - "default_model": "BAAI/bge-reranker-v2-m3", - }, - "cohere": { - "api_base": "https://api.cohere.ai", - "endpoint": "/v1/rerank", - "default_model": "rerank-english-v3.0", - }, - "jina": { - "api_base": "https://api.jina.ai", - "endpoint": "/v1/rerank", - "default_model": "jina-reranker-v2-base-multilingual", - }, - } - - def __init__( - self, - *, - provider: str = "siliconflow", - model_name: str | None = None, - api_key: str | None = None, - api_base: str | None = None, - timeout: float = 30.0, - max_retries: int = 3, - backoff_base_s: float = 0.5, - backoff_max_s: float = 8.0, - env_api_key: str = _DEFAULT_ENV_API_KEY, - workspace_root: Path | str | None = None, - max_input_tokens: int | None = None, - ) -> None: - ok, err = check_httpx_available() - if not ok: # pragma: no cover - exercised via factory availability tests - raise ImportError(err) - - import httpx - - self._workspace_root = Path(workspace_root) if workspace_root else None - - self.provider = (provider or "").strip().lower() - if self.provider not in self._PROVIDER_DEFAULTS: - raise ValueError( - f"Unknown reranker provider: {provider}. 
" - f"Supported providers: {', '.join(sorted(self._PROVIDER_DEFAULTS))}" - ) - - defaults = self._PROVIDER_DEFAULTS[self.provider] - - # Load api_base from env with .env fallback - env_api_base = _get_env_with_fallback("RERANKER_API_BASE", self._workspace_root) - self.api_base = (api_base or env_api_base or defaults["api_base"]).strip().rstrip("/") - self.endpoint = defaults["endpoint"] - - # Load model from env with .env fallback - env_model = _get_env_with_fallback("RERANKER_MODEL", self._workspace_root) - self.model_name = (model_name or env_model or defaults["default_model"]).strip() - if not self.model_name: - raise ValueError("model_name cannot be blank") - - # Load API key from env with .env fallback - resolved_key = api_key or _get_env_with_fallback(env_api_key, self._workspace_root) or "" - resolved_key = resolved_key.strip() - if not resolved_key: - raise ValueError( - f"Missing API key for reranker provider '{self.provider}'. " - f"Pass api_key=... or set ${env_api_key}." - ) - self._api_key = resolved_key - - self.timeout_s = float(timeout) if timeout and float(timeout) > 0 else 30.0 - self.max_retries = int(max_retries) if max_retries and int(max_retries) >= 0 else 3 - self.backoff_base_s = float(backoff_base_s) if backoff_base_s and float(backoff_base_s) > 0 else 0.5 - self.backoff_max_s = float(backoff_max_s) if backoff_max_s and float(backoff_max_s) > 0 else 8.0 - - headers = { - "Authorization": f"Bearer {self._api_key}", - "Content-Type": "application/json", - } - if self.provider == "cohere": - headers.setdefault("Cohere-Version", "2022-12-06") - - self._client = httpx.Client( - base_url=self.api_base, - headers=headers, - timeout=self.timeout_s, - ) - - # Store max_input_tokens with model-aware defaults - if max_input_tokens is not None: - self._max_input_tokens = max_input_tokens - else: - # Infer from model name - model_lower = self.model_name.lower() - if '8b' in model_lower or 'large' in model_lower: - self._max_input_tokens = 32768 - else: 
- self._max_input_tokens = 8192 - - @property - def max_input_tokens(self) -> int: - """Return maximum token limit for reranking.""" - return self._max_input_tokens - - def close(self) -> None: - try: - self._client.close() - except Exception: # pragma: no cover - defensive - return - - def _sleep_backoff(self, attempt: int, *, retry_after_s: float | None = None) -> None: - if retry_after_s is not None and retry_after_s > 0: - time.sleep(min(float(retry_after_s), self.backoff_max_s)) - return - - exp = self.backoff_base_s * (2**attempt) - jitter = random.uniform(0, min(0.5, self.backoff_base_s)) - time.sleep(min(self.backoff_max_s, exp + jitter)) - - @staticmethod - def _parse_retry_after_seconds(headers: Mapping[str, str]) -> float | None: - value = (headers.get("Retry-After") or "").strip() - if not value: - return None - try: - return float(value) - except ValueError: - return None - - @staticmethod - def _should_retry_status(status_code: int) -> bool: - return status_code == 429 or 500 <= status_code <= 599 - - def _request_json(self, payload: Mapping[str, Any]) -> Mapping[str, Any]: - last_exc: Exception | None = None - - for attempt in range(self.max_retries + 1): - try: - response = self._client.post(self.endpoint, json=dict(payload)) - except Exception as exc: # httpx is optional at import-time - last_exc = exc - if attempt < self.max_retries: - self._sleep_backoff(attempt) - continue - raise RuntimeError( - f"Rerank request failed for provider '{self.provider}' after " - f"{self.max_retries + 1} attempts: {type(exc).__name__}: {exc}" - ) from exc - - status = int(getattr(response, "status_code", 0) or 0) - if status >= 400: - body_preview = "" - try: - body_preview = (response.text or "").strip() - except Exception: - body_preview = "" - if len(body_preview) > 300: - body_preview = body_preview[:300] + "…" - - if self._should_retry_status(status) and attempt < self.max_retries: - retry_after = self._parse_retry_after_seconds(response.headers) - 
logger.warning( - "Rerank request to %s%s failed with HTTP %s (attempt %s/%s). Retrying…", - self.api_base, - self.endpoint, - status, - attempt + 1, - self.max_retries + 1, - ) - self._sleep_backoff(attempt, retry_after_s=retry_after) - continue - - if status in {401, 403}: - raise RuntimeError( - f"Rerank request unauthorized for provider '{self.provider}' (HTTP {status}). " - "Check your API key." - ) - - raise RuntimeError( - f"Rerank request failed for provider '{self.provider}' (HTTP {status}). " - f"Response: {body_preview or ''}" - ) - - try: - data = response.json() - except Exception as exc: - raise RuntimeError( - f"Rerank response from provider '{self.provider}' is not valid JSON: " - f"{type(exc).__name__}: {exc}" - ) from exc - - if not isinstance(data, dict): - raise RuntimeError( - f"Rerank response from provider '{self.provider}' must be a JSON object; " - f"got {type(data).__name__}" - ) - - return data - - raise RuntimeError( - f"Rerank request failed for provider '{self.provider}'. Last error: {last_exc}" - ) - - @staticmethod - def _extract_scores_from_results(results: Any, expected: int) -> list[float]: - if not isinstance(results, list): - raise RuntimeError(f"Invalid rerank response: 'results' must be a list, got {type(results).__name__}") - - scores: list[float] = [0.0 for _ in range(expected)] - filled = 0 - - for item in results: - if not isinstance(item, dict): - continue - idx = item.get("index") - score = item.get("relevance_score", item.get("score")) - if idx is None or score is None: - continue - try: - idx_int = int(idx) - score_f = float(score) - except (TypeError, ValueError): - continue - if 0 <= idx_int < expected: - scores[idx_int] = score_f - filled += 1 - - if filled != expected: - raise RuntimeError( - f"Rerank response contained {filled}/{expected} scored documents; " - "ensure top_n matches the number of documents." 
- ) - - return scores - - def _build_payload(self, *, query: str, documents: Sequence[str]) -> Mapping[str, Any]: - payload: dict[str, Any] = { - "model": self.model_name, - "query": query, - "documents": list(documents), - "top_n": len(documents), - "return_documents": False, - } - return payload - - def _estimate_tokens(self, text: str) -> int: - """Estimate token count using fast heuristic. - - Uses len(text) // 4 as approximation (~4 chars per token for English). - Not perfectly accurate for all models/languages but sufficient for - batch sizing decisions where exact counts aren't critical. - """ - return len(text) // 4 - - def _create_token_aware_batches( - self, - query: str, - documents: Sequence[str], - ) -> list[list[tuple[int, str]]]: - """Split documents into batches that fit within token limits. - - Uses 90% of max_input_tokens as safety margin. - Each batch includes the query tokens overhead. - """ - max_tokens = int(self._max_input_tokens * 0.9) - query_tokens = self._estimate_tokens(query) - - batches: list[list[tuple[int, str]]] = [] - current_batch: list[tuple[int, str]] = [] - current_tokens = query_tokens # Start with query overhead - - for idx, doc in enumerate(documents): - doc_tokens = self._estimate_tokens(doc) - - # Warn if single document exceeds token limit (will be truncated by API) - if doc_tokens > max_tokens - query_tokens: - logger.warning( - f"Document {idx} exceeds token limit: ~{doc_tokens} tokens " - f"(limit: {max_tokens - query_tokens} after query overhead). " - "Document will likely be truncated by the API." 
- ) - - # If batch would exceed limit, start new batch - if current_tokens + doc_tokens > max_tokens and current_batch: - batches.append(current_batch) - current_batch = [] - current_tokens = query_tokens - - current_batch.append((idx, doc)) - current_tokens += doc_tokens - - if current_batch: - batches.append(current_batch) - - return batches - - def _rerank_one_query(self, *, query: str, documents: Sequence[str]) -> list[float]: - if not documents: - return [] - - # Create token-aware batches - batches = self._create_token_aware_batches(query, documents) - - if len(batches) == 1: - # Single batch - original behavior - payload = self._build_payload(query=query, documents=documents) - data = self._request_json(payload) - results = data.get("results") - return self._extract_scores_from_results(results, expected=len(documents)) - - # Multiple batches - process each and merge results - logger.info( - f"Splitting {len(documents)} documents into {len(batches)} batches " - f"(max_input_tokens: {self._max_input_tokens})" - ) - - all_scores: list[float] = [0.0] * len(documents) - - for batch in batches: - batch_docs = [doc for _, doc in batch] - payload = self._build_payload(query=query, documents=batch_docs) - data = self._request_json(payload) - results = data.get("results") - batch_scores = self._extract_scores_from_results(results, expected=len(batch_docs)) - - # Map scores back to original indices - for (orig_idx, _), score in zip(batch, batch_scores): - all_scores[orig_idx] = score - - return all_scores - - def score_pairs( - self, - pairs: Sequence[tuple[str, str]], - *, - batch_size: int = 32, # noqa: ARG002 - kept for BaseReranker compatibility - ) -> list[float]: - if not pairs: - return [] - - grouped: dict[str, list[tuple[int, str]]] = {} - for idx, (query, doc) in enumerate(pairs): - grouped.setdefault(str(query), []).append((idx, str(doc))) - - scores: list[float] = [0.0 for _ in range(len(pairs))] - - for query, items in grouped.items(): - documents = [doc 
for _, doc in items] - query_scores = self._rerank_one_query(query=query, documents=documents) - for (orig_idx, _), score in zip(items, query_scores): - scores[orig_idx] = float(score) - - return scores diff --git a/codex-lens/build/lib/codexlens/semantic/reranker/base.py b/codex-lens/build/lib/codexlens/semantic/reranker/base.py deleted file mode 100644 index 65c2d837..00000000 --- a/codex-lens/build/lib/codexlens/semantic/reranker/base.py +++ /dev/null @@ -1,46 +0,0 @@ -"""Base class for rerankers. - -Defines the interface that all rerankers must implement. -""" - -from __future__ import annotations - -from abc import ABC, abstractmethod -from typing import Sequence - - -class BaseReranker(ABC): - """Base class for all rerankers. - - All reranker implementations must inherit from this class and implement - the abstract methods to ensure a consistent interface. - """ - - @property - def max_input_tokens(self) -> int: - """Return maximum token limit for reranking. - - Returns: - int: Maximum number of tokens that can be processed at once. - Default is 8192 if not overridden by implementation. - """ - return 8192 - - @abstractmethod - def score_pairs( - self, - pairs: Sequence[tuple[str, str]], - *, - batch_size: int = 32, - ) -> list[float]: - """Score (query, doc) pairs. - - Args: - pairs: Sequence of (query, doc) string pairs to score. - batch_size: Batch size for scoring. - - Returns: - List of scores (one per pair). - """ - ... - diff --git a/codex-lens/build/lib/codexlens/semantic/reranker/factory.py b/codex-lens/build/lib/codexlens/semantic/reranker/factory.py deleted file mode 100644 index 5dccc758..00000000 --- a/codex-lens/build/lib/codexlens/semantic/reranker/factory.py +++ /dev/null @@ -1,159 +0,0 @@ -"""Factory for creating rerankers. - -Provides a unified interface for instantiating different reranker backends. 
-""" - -from __future__ import annotations - -from typing import Any - -from .base import BaseReranker - - -def check_reranker_available(backend: str) -> tuple[bool, str | None]: - """Check whether a specific reranker backend can be used. - - Notes: - - "fastembed" uses fastembed TextCrossEncoder (pip install fastembed>=0.4.0). [Recommended] - - "onnx" redirects to "fastembed" for backward compatibility. - - "legacy" uses sentence-transformers CrossEncoder (pip install codexlens[reranker-legacy]). - - "api" uses a remote reranking HTTP API (requires httpx). - - "litellm" uses `ccw-litellm` for unified access to LLM providers. - """ - backend = (backend or "").strip().lower() - - if backend == "legacy": - from .legacy import check_cross_encoder_available - - return check_cross_encoder_available() - - if backend == "fastembed": - from .fastembed_reranker import check_fastembed_reranker_available - - return check_fastembed_reranker_available() - - if backend == "onnx": - # Redirect to fastembed for backward compatibility - from .fastembed_reranker import check_fastembed_reranker_available - - return check_fastembed_reranker_available() - - if backend == "litellm": - try: - import ccw_litellm # noqa: F401 - except ImportError as exc: # pragma: no cover - optional dependency - return ( - False, - f"ccw-litellm not available: {exc}. Install with: pip install ccw-litellm", - ) - - try: - from .litellm_reranker import LiteLLMReranker # noqa: F401 - except Exception as exc: # pragma: no cover - defensive - return False, f"LiteLLM reranker backend not available: {exc}" - - return True, None - - if backend == "api": - from .api_reranker import check_httpx_available - - return check_httpx_available() - - return False, ( - f"Invalid reranker backend: {backend}. " - "Must be 'fastembed', 'onnx', 'api', 'litellm', or 'legacy'." 
- ) - - -def get_reranker( - backend: str = "fastembed", - model_name: str | None = None, - *, - device: str | None = None, - **kwargs: Any, -) -> BaseReranker: - """Factory function to create reranker based on backend. - - Args: - backend: Reranker backend to use. Options: - - "fastembed": FastEmbed TextCrossEncoder backend (default, recommended) - - "onnx": Redirects to fastembed for backward compatibility - - "api": HTTP API backend (remote providers) - - "litellm": LiteLLM backend (LLM-based, for API mode) - - "legacy": sentence-transformers CrossEncoder backend (optional) - model_name: Model identifier for model-based backends. Defaults depend on backend: - - fastembed: Xenova/ms-marco-MiniLM-L-6-v2 - - onnx: (redirects to fastembed) - - api: BAAI/bge-reranker-v2-m3 (SiliconFlow) - - legacy: cross-encoder/ms-marco-MiniLM-L-6-v2 - - litellm: default - device: Optional device string for backends that support it (legacy only). - **kwargs: Additional backend-specific arguments. - - Returns: - BaseReranker: Configured reranker instance. - - Raises: - ValueError: If backend is not recognized. - ImportError: If required backend dependencies are not installed or backend is unavailable. - """ - backend = (backend or "").strip().lower() - - if backend == "fastembed": - ok, err = check_reranker_available("fastembed") - if not ok: - raise ImportError(err) - - from .fastembed_reranker import FastEmbedReranker - - resolved_model_name = (model_name or "").strip() or FastEmbedReranker.DEFAULT_MODEL - _ = device # Device selection is managed via fastembed providers. 
- return FastEmbedReranker(model_name=resolved_model_name, **kwargs) - - if backend == "onnx": - # Redirect to fastembed for backward compatibility - ok, err = check_reranker_available("fastembed") - if not ok: - raise ImportError(err) - - from .fastembed_reranker import FastEmbedReranker - - resolved_model_name = (model_name or "").strip() or FastEmbedReranker.DEFAULT_MODEL - _ = device # Device selection is managed via fastembed providers. - return FastEmbedReranker(model_name=resolved_model_name, **kwargs) - - if backend == "legacy": - ok, err = check_reranker_available("legacy") - if not ok: - raise ImportError(err) - - from .legacy import CrossEncoderReranker - - resolved_model_name = (model_name or "").strip() or "cross-encoder/ms-marco-MiniLM-L-6-v2" - return CrossEncoderReranker(model_name=resolved_model_name, device=device) - - if backend == "litellm": - ok, err = check_reranker_available("litellm") - if not ok: - raise ImportError(err) - - from .litellm_reranker import LiteLLMReranker - - _ = device # Device selection is not applicable to remote LLM backends. - resolved_model_name = (model_name or "").strip() or "default" - return LiteLLMReranker(model=resolved_model_name, **kwargs) - - if backend == "api": - ok, err = check_reranker_available("api") - if not ok: - raise ImportError(err) - - from .api_reranker import APIReranker - - _ = device # Device selection is not applicable to remote HTTP backends. - resolved_model_name = (model_name or "").strip() or None - return APIReranker(model_name=resolved_model_name, **kwargs) - - raise ValueError( - f"Unknown backend: {backend}. 
Supported backends: 'fastembed', 'onnx', 'api', 'litellm', 'legacy'" - ) diff --git a/codex-lens/build/lib/codexlens/semantic/reranker/fastembed_reranker.py b/codex-lens/build/lib/codexlens/semantic/reranker/fastembed_reranker.py deleted file mode 100644 index c38d4aa0..00000000 --- a/codex-lens/build/lib/codexlens/semantic/reranker/fastembed_reranker.py +++ /dev/null @@ -1,257 +0,0 @@ -"""FastEmbed-based reranker backend. - -This reranker uses fastembed's TextCrossEncoder for cross-encoder reranking. -FastEmbed is ONNX-based internally but provides a cleaner, unified API. - -Install: - pip install fastembed>=0.4.0 -""" - -from __future__ import annotations - -import logging -import threading -from typing import Any, Sequence - -from .base import BaseReranker - -logger = logging.getLogger(__name__) - - -def check_fastembed_reranker_available() -> tuple[bool, str | None]: - """Check whether fastembed reranker dependencies are available.""" - try: - import fastembed # noqa: F401 - except ImportError as exc: # pragma: no cover - optional dependency - return ( - False, - f"fastembed not available: {exc}. Install with: pip install fastembed>=0.4.0", - ) - - try: - from fastembed.rerank.cross_encoder import TextCrossEncoder # noqa: F401 - except ImportError as exc: # pragma: no cover - optional dependency - return ( - False, - f"fastembed TextCrossEncoder not available: {exc}. 
" - "Upgrade with: pip install fastembed>=0.4.0", - ) - - return True, None - - -class FastEmbedReranker(BaseReranker): - """Cross-encoder reranker using fastembed's TextCrossEncoder with lazy loading.""" - - DEFAULT_MODEL = "Xenova/ms-marco-MiniLM-L-6-v2" - - # Alternative models supported by fastembed: - # - "BAAI/bge-reranker-base" - # - "BAAI/bge-reranker-large" - # - "cross-encoder/ms-marco-MiniLM-L-6-v2" - - def __init__( - self, - model_name: str | None = None, - *, - use_gpu: bool = True, - cache_dir: str | None = None, - threads: int | None = None, - ) -> None: - """Initialize FastEmbed reranker. - - Args: - model_name: Model identifier. Defaults to Xenova/ms-marco-MiniLM-L-6-v2. - use_gpu: Whether to use GPU acceleration when available. - cache_dir: Optional directory for caching downloaded models. - threads: Optional number of threads for ONNX Runtime. - """ - self.model_name = (model_name or self.DEFAULT_MODEL).strip() - if not self.model_name: - raise ValueError("model_name cannot be blank") - - self.use_gpu = bool(use_gpu) - self.cache_dir = cache_dir - self.threads = threads - - self._encoder: Any | None = None - self._lock = threading.RLock() - - def _load_model(self) -> None: - """Lazy-load the TextCrossEncoder model.""" - if self._encoder is not None: - return - - ok, err = check_fastembed_reranker_available() - if not ok: - raise ImportError(err) - - with self._lock: - if self._encoder is not None: - return - - from fastembed.rerank.cross_encoder import TextCrossEncoder - - # Determine providers based on GPU preference - providers: list[str] | None = None - if self.use_gpu: - try: - from ..gpu_support import get_optimal_providers - - providers = get_optimal_providers(use_gpu=True, with_device_options=False) - except Exception: - # Fallback: let fastembed decide - providers = None - - # Build initialization kwargs - init_kwargs: dict[str, Any] = {} - if self.cache_dir: - init_kwargs["cache_dir"] = self.cache_dir - if self.threads is not None: - 
init_kwargs["threads"] = self.threads - if providers: - init_kwargs["providers"] = providers - - logger.debug( - "Loading FastEmbed reranker model: %s (use_gpu=%s)", - self.model_name, - self.use_gpu, - ) - - self._encoder = TextCrossEncoder( - model_name=self.model_name, - **init_kwargs, - ) - - logger.debug("FastEmbed reranker model loaded successfully") - - @staticmethod - def _sigmoid(x: float) -> float: - """Numerically stable sigmoid function.""" - if x < -709: - return 0.0 - if x > 709: - return 1.0 - import math - return 1.0 / (1.0 + math.exp(-x)) - - def score_pairs( - self, - pairs: Sequence[tuple[str, str]], - *, - batch_size: int = 32, - ) -> list[float]: - """Score (query, doc) pairs. - - Args: - pairs: Sequence of (query, doc) string pairs to score. - batch_size: Batch size for scoring. - - Returns: - List of scores (one per pair), normalized to [0, 1] range. - """ - if not pairs: - return [] - - self._load_model() - - if self._encoder is None: # pragma: no cover - defensive - return [] - - # FastEmbed's TextCrossEncoder.rerank() expects a query and list of documents. - # For batch scoring of multiple query-doc pairs, we need to process them. - # Group by query for efficiency when same query appears multiple times. 
- query_to_docs: dict[str, list[tuple[int, str]]] = {} - for idx, (query, doc) in enumerate(pairs): - if query not in query_to_docs: - query_to_docs[query] = [] - query_to_docs[query].append((idx, doc)) - - # Score each query group - scores: list[float] = [0.0] * len(pairs) - - for query, indexed_docs in query_to_docs.items(): - docs = [doc for _, doc in indexed_docs] - indices = [idx for idx, _ in indexed_docs] - - try: - # TextCrossEncoder.rerank returns raw float scores in same order as input - raw_scores = list( - self._encoder.rerank( - query=query, - documents=docs, - batch_size=batch_size, - ) - ) - - # Map scores back to original positions and normalize with sigmoid - for i, raw_score in enumerate(raw_scores): - if i < len(indices): - original_idx = indices[i] - # Normalize score to [0, 1] using stable sigmoid - scores[original_idx] = self._sigmoid(float(raw_score)) - - except Exception as e: - logger.warning("FastEmbed rerank failed for query: %s", str(e)[:100]) - # Leave scores as 0.0 for failed queries - - return scores - - def rerank( - self, - query: str, - documents: Sequence[str], - *, - top_k: int | None = None, - batch_size: int = 32, - ) -> list[tuple[float, str, int]]: - """Rerank documents for a single query. - - This is a convenience method that provides results in ranked order. - - Args: - query: The query string. - documents: List of documents to rerank. - top_k: Return only top K results. None returns all. - batch_size: Batch size for scoring. - - Returns: - List of (score, document, original_index) tuples, sorted by score descending. 
- """ - if not documents: - return [] - - self._load_model() - - if self._encoder is None: # pragma: no cover - defensive - return [] - - try: - # TextCrossEncoder.rerank returns raw float scores in same order as input - raw_scores = list( - self._encoder.rerank( - query=query, - documents=list(documents), - batch_size=batch_size, - ) - ) - - # Convert to our format: (normalized_score, document, original_index) - ranked = [] - for idx, raw_score in enumerate(raw_scores): - if idx < len(documents): - # Normalize score to [0, 1] using stable sigmoid - normalized = self._sigmoid(float(raw_score)) - ranked.append((normalized, documents[idx], idx)) - - # Sort by score descending - ranked.sort(key=lambda x: x[0], reverse=True) - - if top_k is not None and top_k > 0: - ranked = ranked[:top_k] - - return ranked - - except Exception as e: - logger.warning("FastEmbed rerank failed: %s", str(e)[:100]) - return [] diff --git a/codex-lens/build/lib/codexlens/semantic/reranker/legacy.py b/codex-lens/build/lib/codexlens/semantic/reranker/legacy.py deleted file mode 100644 index a5ee05de..00000000 --- a/codex-lens/build/lib/codexlens/semantic/reranker/legacy.py +++ /dev/null @@ -1,91 +0,0 @@ -"""Legacy sentence-transformers cross-encoder reranker. 
- -Install with: pip install codexlens[reranker-legacy] -""" - -from __future__ import annotations - -import logging -import threading -from typing import List, Sequence, Tuple - -from .base import BaseReranker - -logger = logging.getLogger(__name__) - -try: - from sentence_transformers import CrossEncoder as _CrossEncoder - - CROSS_ENCODER_AVAILABLE = True - _import_error: str | None = None -except ImportError as exc: # pragma: no cover - optional dependency - _CrossEncoder = None # type: ignore[assignment] - CROSS_ENCODER_AVAILABLE = False - _import_error = str(exc) - - -def check_cross_encoder_available() -> tuple[bool, str | None]: - if CROSS_ENCODER_AVAILABLE: - return True, None - return ( - False, - _import_error - or "sentence-transformers not available. Install with: pip install codexlens[reranker-legacy]", - ) - - -class CrossEncoderReranker(BaseReranker): - """Cross-encoder reranker with lazy model loading.""" - - def __init__(self, model_name: str, *, device: str | None = None) -> None: - self.model_name = (model_name or "").strip() - if not self.model_name: - raise ValueError("model_name cannot be blank") - - self.device = (device or "").strip() or None - self._model = None - self._lock = threading.RLock() - - def _load_model(self) -> None: - if self._model is not None: - return - - ok, err = check_cross_encoder_available() - if not ok: - raise ImportError(err) - - with self._lock: - if self._model is not None: - return - - try: - if self.device: - self._model = _CrossEncoder(self.model_name, device=self.device) # type: ignore[misc] - else: - self._model = _CrossEncoder(self.model_name) # type: ignore[misc] - except Exception as exc: - logger.debug("Failed to load cross-encoder model %s: %s", self.model_name, exc) - raise - - def score_pairs( - self, - pairs: Sequence[Tuple[str, str]], - *, - batch_size: int = 32, - ) -> List[float]: - """Score (query, doc) pairs using the cross-encoder. 
- - Returns: - List of scores (one per pair) in the model's native scale (usually logits). - """ - if not pairs: - return [] - - self._load_model() - - if self._model is None: # pragma: no cover - defensive - return [] - - bs = int(batch_size) if batch_size and int(batch_size) > 0 else 32 - scores = self._model.predict(list(pairs), batch_size=bs) # type: ignore[union-attr] - return [float(s) for s in scores] diff --git a/codex-lens/build/lib/codexlens/semantic/reranker/litellm_reranker.py b/codex-lens/build/lib/codexlens/semantic/reranker/litellm_reranker.py deleted file mode 100644 index ec735994..00000000 --- a/codex-lens/build/lib/codexlens/semantic/reranker/litellm_reranker.py +++ /dev/null @@ -1,214 +0,0 @@ -"""Experimental LiteLLM reranker backend. - -This module provides :class:`LiteLLMReranker`, which uses an LLM to score the -relevance of a single (query, document) pair per request. - -Notes: - - This backend is experimental and may be slow/expensive compared to local - rerankers. - - It relies on `ccw-litellm` for a unified LLM API across providers. -""" - -from __future__ import annotations - -import json -import logging -import re -import threading -import time -from typing import Any, Sequence - -from .base import BaseReranker - -logger = logging.getLogger(__name__) - -_NUMBER_RE = re.compile(r"[-+]?\d+(?:\.\d+)?(?:[eE][-+]?\d+)?") - - -def _coerce_score_to_unit_interval(score: float) -> float: - """Coerce a numeric score into [0, 1]. - - The prompt asks for a float in [0, 1], but some models may respond with 0-10 - or 0-100 scales. This function attempts a conservative normalization. 
- """ - if 0.0 <= score <= 1.0: - return score - if 0.0 <= score <= 10.0: - return score / 10.0 - if 0.0 <= score <= 100.0: - return score / 100.0 - return max(0.0, min(1.0, score)) - - -def _extract_score(text: str) -> float | None: - """Extract a numeric relevance score from an LLM response.""" - content = (text or "").strip() - if not content: - return None - - # Prefer JSON if present. - if "{" in content and "}" in content: - try: - start = content.index("{") - end = content.rindex("}") + 1 - payload = json.loads(content[start:end]) - if isinstance(payload, dict) and "score" in payload: - return float(payload["score"]) - except Exception: - pass - - match = _NUMBER_RE.search(content) - if not match: - return None - try: - return float(match.group(0)) - except ValueError: - return None - - -class LiteLLMReranker(BaseReranker): - """Experimental reranker that uses a LiteLLM-compatible model. - - This reranker scores each (query, doc) pair in isolation (single-pair mode) - to improve prompt reliability across providers. - """ - - _SYSTEM_PROMPT = ( - "You are a relevance scoring assistant.\n" - "Given a search query and a document snippet, output a single numeric " - "relevance score between 0 and 1.\n\n" - "Scoring guidance:\n" - "- 1.0: The document directly answers the query.\n" - "- 0.5: The document is partially relevant.\n" - "- 0.0: The document is unrelated.\n\n" - "Output requirements:\n" - "- Output ONLY the number (e.g., 0.73).\n" - "- Do not include any other text." - ) - - def __init__( - self, - model: str = "default", - *, - requests_per_minute: float | None = None, - min_interval_seconds: float | None = None, - default_score: float = 0.0, - max_doc_chars: int = 8000, - **litellm_kwargs: Any, - ) -> None: - """Initialize the reranker. - - Args: - model: Model name from ccw-litellm configuration (default: "default"). - requests_per_minute: Optional rate limit in requests per minute. - min_interval_seconds: Optional minimum interval between requests. 
If set, - it takes precedence over requests_per_minute. - default_score: Score to use when an API call fails or parsing fails. - max_doc_chars: Maximum number of document characters to include in the prompt. - **litellm_kwargs: Passed through to `ccw_litellm.LiteLLMClient`. - - Raises: - ImportError: If ccw-litellm is not installed. - ValueError: If model is blank. - """ - self.model_name = (model or "").strip() - if not self.model_name: - raise ValueError("model cannot be blank") - - self.default_score = float(default_score) - - self.max_doc_chars = int(max_doc_chars) if int(max_doc_chars) > 0 else 0 - - if min_interval_seconds is not None: - self._min_interval_seconds = max(0.0, float(min_interval_seconds)) - elif requests_per_minute is not None and float(requests_per_minute) > 0: - self._min_interval_seconds = 60.0 / float(requests_per_minute) - else: - self._min_interval_seconds = 0.0 - - # Prefer deterministic output by default; allow overrides via kwargs. - litellm_kwargs = dict(litellm_kwargs) - litellm_kwargs.setdefault("temperature", 0.0) - litellm_kwargs.setdefault("max_tokens", 16) - - try: - from ccw_litellm import ChatMessage, LiteLLMClient - except ImportError as exc: # pragma: no cover - optional dependency - raise ImportError( - "ccw-litellm not installed. Install with: pip install ccw-litellm" - ) from exc - - self._ChatMessage = ChatMessage - self._client = LiteLLMClient(model=self.model_name, **litellm_kwargs) - - self._lock = threading.RLock() - self._last_request_at = 0.0 - - def _sanitize_text(self, text: str) -> str: - # Keep consistent with LiteLLMEmbedderWrapper workaround. 
- if text.startswith("import"): - return " " + text - return text - - def _rate_limit(self) -> None: - if self._min_interval_seconds <= 0: - return - with self._lock: - now = time.monotonic() - elapsed = now - self._last_request_at - if elapsed < self._min_interval_seconds: - time.sleep(self._min_interval_seconds - elapsed) - self._last_request_at = time.monotonic() - - def _build_user_prompt(self, query: str, doc: str) -> str: - sanitized_query = self._sanitize_text(query or "") - sanitized_doc = self._sanitize_text(doc or "") - if self.max_doc_chars and len(sanitized_doc) > self.max_doc_chars: - sanitized_doc = sanitized_doc[: self.max_doc_chars] - - return ( - "Query:\n" - f"{sanitized_query}\n\n" - "Document:\n" - f"{sanitized_doc}\n\n" - "Return the relevance score (0 to 1) as a single number:" - ) - - def _score_single_pair(self, query: str, doc: str) -> float: - messages = [ - self._ChatMessage(role="system", content=self._SYSTEM_PROMPT), - self._ChatMessage(role="user", content=self._build_user_prompt(query, doc)), - ] - - try: - self._rate_limit() - response = self._client.chat(messages) - except Exception as exc: - logger.debug("LiteLLM reranker request failed: %s", exc) - return self.default_score - - raw = getattr(response, "content", "") or "" - score = _extract_score(raw) - if score is None: - logger.debug("Failed to parse LiteLLM reranker score from response: %r", raw) - return self.default_score - return _coerce_score_to_unit_interval(float(score)) - - def score_pairs( - self, - pairs: Sequence[tuple[str, str]], - *, - batch_size: int = 32, - ) -> list[float]: - """Score (query, doc) pairs with per-pair LLM calls.""" - if not pairs: - return [] - - bs = int(batch_size) if batch_size and int(batch_size) > 0 else 32 - - scores: list[float] = [] - for i in range(0, len(pairs), bs): - batch = pairs[i : i + bs] - for query, doc in batch: - scores.append(self._score_single_pair(query, doc)) - return scores diff --git 
a/codex-lens/build/lib/codexlens/semantic/reranker/onnx_reranker.py b/codex-lens/build/lib/codexlens/semantic/reranker/onnx_reranker.py deleted file mode 100644 index 0b22f45e..00000000 --- a/codex-lens/build/lib/codexlens/semantic/reranker/onnx_reranker.py +++ /dev/null @@ -1,268 +0,0 @@ -"""Optimum + ONNX Runtime reranker backend. - -This reranker uses Hugging Face Optimum's ONNXRuntime backend for sequence -classification models. It is designed to run without requiring PyTorch at -runtime by using numpy tensors and ONNX Runtime execution providers. - -Install (CPU): - pip install onnxruntime optimum[onnxruntime] transformers -""" - -from __future__ import annotations - -import logging -import threading -from typing import Any, Iterable, Sequence - -from .base import BaseReranker - -logger = logging.getLogger(__name__) - - -def check_onnx_reranker_available() -> tuple[bool, str | None]: - """Check whether Optimum + ONNXRuntime reranker dependencies are available.""" - try: - import numpy # noqa: F401 - except ImportError as exc: # pragma: no cover - optional dependency - return False, f"numpy not available: {exc}. Install with: pip install numpy" - - try: - import onnxruntime # noqa: F401 - except ImportError as exc: # pragma: no cover - optional dependency - return ( - False, - f"onnxruntime not available: {exc}. Install with: pip install onnxruntime", - ) - - try: - from optimum.onnxruntime import ORTModelForSequenceClassification # noqa: F401 - except ImportError as exc: # pragma: no cover - optional dependency - return ( - False, - f"optimum[onnxruntime] not available: {exc}. Install with: pip install optimum[onnxruntime]", - ) - - try: - from transformers import AutoTokenizer # noqa: F401 - except ImportError as exc: # pragma: no cover - optional dependency - return ( - False, - f"transformers not available: {exc}. 
Install with: pip install transformers", - ) - - return True, None - - -def _iter_batches(items: Sequence[Any], batch_size: int) -> Iterable[Sequence[Any]]: - for i in range(0, len(items), batch_size): - yield items[i : i + batch_size] - - -class ONNXReranker(BaseReranker): - """Cross-encoder reranker using Optimum + ONNX Runtime with lazy loading.""" - - DEFAULT_MODEL = "Xenova/ms-marco-MiniLM-L-6-v2" - - def __init__( - self, - model_name: str | None = None, - *, - use_gpu: bool = True, - providers: list[Any] | None = None, - max_length: int | None = None, - ) -> None: - self.model_name = (model_name or self.DEFAULT_MODEL).strip() - if not self.model_name: - raise ValueError("model_name cannot be blank") - - self.use_gpu = bool(use_gpu) - self.providers = providers - - self.max_length = int(max_length) if max_length is not None else None - - self._tokenizer: Any | None = None - self._model: Any | None = None - self._model_input_names: set[str] | None = None - self._lock = threading.RLock() - - def _load_model(self) -> None: - if self._model is not None and self._tokenizer is not None: - return - - ok, err = check_onnx_reranker_available() - if not ok: - raise ImportError(err) - - with self._lock: - if self._model is not None and self._tokenizer is not None: - return - - from inspect import signature - - from optimum.onnxruntime import ORTModelForSequenceClassification - from transformers import AutoTokenizer - - if self.providers is None: - from ..gpu_support import get_optimal_providers - - # Include device_id options for DirectML/CUDA selection when available. - self.providers = get_optimal_providers( - use_gpu=self.use_gpu, with_device_options=True - ) - - # Some Optimum versions accept `providers`, others accept a single `provider`. - # Prefer passing the full providers list, with a conservative fallback. 
- model_kwargs: dict[str, Any] = {} - try: - params = signature(ORTModelForSequenceClassification.from_pretrained).parameters - if "providers" in params: - model_kwargs["providers"] = self.providers - elif "provider" in params: - provider_name = "CPUExecutionProvider" - if self.providers: - first = self.providers[0] - provider_name = first[0] if isinstance(first, tuple) else str(first) - model_kwargs["provider"] = provider_name - except Exception: - model_kwargs = {} - - try: - self._model = ORTModelForSequenceClassification.from_pretrained( - self.model_name, - **model_kwargs, - ) - except TypeError: - # Fallback for older Optimum versions: retry without provider arguments. - self._model = ORTModelForSequenceClassification.from_pretrained(self.model_name) - - self._tokenizer = AutoTokenizer.from_pretrained(self.model_name, use_fast=True) - - # Cache model input names to filter tokenizer outputs defensively. - input_names: set[str] | None = None - for attr in ("input_names", "model_input_names"): - names = getattr(self._model, attr, None) - if isinstance(names, (list, tuple)) and names: - input_names = {str(n) for n in names} - break - if input_names is None: - try: - session = getattr(self._model, "model", None) - if session is not None and hasattr(session, "get_inputs"): - input_names = {i.name for i in session.get_inputs()} - except Exception: - input_names = None - self._model_input_names = input_names - - @staticmethod - def _sigmoid(x: "Any") -> "Any": - import numpy as np - - x = np.clip(x, -50.0, 50.0) - return 1.0 / (1.0 + np.exp(-x)) - - @staticmethod - def _select_relevance_logit(logits: "Any") -> "Any": - import numpy as np - - arr = np.asarray(logits) - if arr.ndim == 0: - return arr.reshape(1) - if arr.ndim == 1: - return arr - if arr.ndim >= 2: - # Common cases: - # - Regression: (batch, 1) - # - Binary classification: (batch, 2) - if arr.shape[-1] == 1: - return arr[..., 0] - if arr.shape[-1] == 2: - # Convert 2-logit softmax into a single logit via 
difference. - return arr[..., 1] - arr[..., 0] - return arr.max(axis=-1) - return arr.reshape(-1) - - def _tokenize_batch(self, batch: Sequence[tuple[str, str]]) -> dict[str, Any]: - if self._tokenizer is None: - raise RuntimeError("Tokenizer not loaded") # pragma: no cover - defensive - - queries = [q for q, _ in batch] - docs = [d for _, d in batch] - - tokenizer_kwargs: dict[str, Any] = { - "text": queries, - "text_pair": docs, - "padding": True, - "truncation": True, - "return_tensors": "np", - } - - max_len = self.max_length - if max_len is None: - try: - model_max = int(getattr(self._tokenizer, "model_max_length", 0) or 0) - if 0 < model_max < 10_000: - max_len = model_max - else: - max_len = 512 - except Exception: - max_len = 512 - if max_len is not None and max_len > 0: - tokenizer_kwargs["max_length"] = int(max_len) - - encoded = self._tokenizer(**tokenizer_kwargs) - inputs = dict(encoded) - - # Some models do not accept token_type_ids; filter to known input names if available. 
- if self._model_input_names: - inputs = {k: v for k, v in inputs.items() if k in self._model_input_names} - - return inputs - - def _forward_logits(self, inputs: dict[str, Any]) -> Any: - if self._model is None: - raise RuntimeError("Model not loaded") # pragma: no cover - defensive - - outputs = self._model(**inputs) - if hasattr(outputs, "logits"): - return outputs.logits - if isinstance(outputs, dict) and "logits" in outputs: - return outputs["logits"] - if isinstance(outputs, (list, tuple)) and outputs: - return outputs[0] - raise RuntimeError("Unexpected model output format") # pragma: no cover - defensive - - def score_pairs( - self, - pairs: Sequence[tuple[str, str]], - *, - batch_size: int = 32, - ) -> list[float]: - """Score (query, doc) pairs with sigmoid-normalized outputs in [0, 1].""" - if not pairs: - return [] - - self._load_model() - - if self._model is None or self._tokenizer is None: # pragma: no cover - defensive - return [] - - import numpy as np - - bs = int(batch_size) if batch_size and int(batch_size) > 0 else 32 - scores: list[float] = [] - - for batch in _iter_batches(list(pairs), bs): - inputs = self._tokenize_batch(batch) - logits = self._forward_logits(inputs) - rel_logits = self._select_relevance_logit(logits) - probs = self._sigmoid(rel_logits) - probs = np.clip(probs, 0.0, 1.0) - scores.extend([float(p) for p in probs.reshape(-1).tolist()]) - - if len(scores) != len(pairs): - logger.debug( - "ONNX reranker produced %d scores for %d pairs", len(scores), len(pairs) - ) - return scores[: len(pairs)] - - return scores diff --git a/codex-lens/build/lib/codexlens/semantic/rotational_embedder.py b/codex-lens/build/lib/codexlens/semantic/rotational_embedder.py deleted file mode 100644 index ff0f41ac..00000000 --- a/codex-lens/build/lib/codexlens/semantic/rotational_embedder.py +++ /dev/null @@ -1,434 +0,0 @@ -"""Rotational embedder for multi-endpoint API load balancing. 
- -Provides intelligent load balancing across multiple LiteLLM embedding endpoints -to maximize throughput while respecting rate limits. -""" - -from __future__ import annotations - -import logging -import random -import threading -import time -from dataclasses import dataclass, field -from enum import Enum -from typing import Any, Dict, Iterable, List, Optional - -import numpy as np - -from .base import BaseEmbedder - -logger = logging.getLogger(__name__) - - -class EndpointStatus(Enum): - """Status of an API endpoint.""" - AVAILABLE = "available" - COOLING = "cooling" # Rate limited, temporarily unavailable - FAILED = "failed" # Permanent failure (auth error, etc.) - - -class SelectionStrategy(Enum): - """Strategy for selecting endpoints.""" - ROUND_ROBIN = "round_robin" - LATENCY_AWARE = "latency_aware" - WEIGHTED_RANDOM = "weighted_random" - - -@dataclass -class EndpointConfig: - """Configuration for a single API endpoint.""" - model: str - api_key: Optional[str] = None - api_base: Optional[str] = None - weight: float = 1.0 # Higher weight = more requests - max_concurrent: int = 4 # Max concurrent requests to this endpoint - - -@dataclass -class EndpointState: - """Runtime state for an endpoint.""" - config: EndpointConfig - embedder: Any = None # LiteLLMEmbedderWrapper instance - - # Health metrics - status: EndpointStatus = EndpointStatus.AVAILABLE - cooldown_until: float = 0.0 # Unix timestamp when cooldown ends - - # Performance metrics - total_requests: int = 0 - total_failures: int = 0 - avg_latency_ms: float = 0.0 - last_latency_ms: float = 0.0 - - # Concurrency tracking - active_requests: int = 0 - lock: threading.Lock = field(default_factory=threading.Lock) - - def is_available(self) -> bool: - """Check if endpoint is available for requests.""" - if self.status == EndpointStatus.FAILED: - return False - if self.status == EndpointStatus.COOLING: - if time.time() >= self.cooldown_until: - self.status = EndpointStatus.AVAILABLE - return True - return 
False - return True - - def set_cooldown(self, seconds: float) -> None: - """Put endpoint in cooldown state.""" - self.status = EndpointStatus.COOLING - self.cooldown_until = time.time() + seconds - logger.warning(f"Endpoint {self.config.model} cooling down for {seconds:.1f}s") - - def mark_failed(self) -> None: - """Mark endpoint as permanently failed.""" - self.status = EndpointStatus.FAILED - logger.error(f"Endpoint {self.config.model} marked as failed") - - def record_success(self, latency_ms: float) -> None: - """Record successful request.""" - self.total_requests += 1 - self.last_latency_ms = latency_ms - # Exponential moving average for latency - alpha = 0.3 - if self.avg_latency_ms == 0: - self.avg_latency_ms = latency_ms - else: - self.avg_latency_ms = alpha * latency_ms + (1 - alpha) * self.avg_latency_ms - - def record_failure(self) -> None: - """Record failed request.""" - self.total_requests += 1 - self.total_failures += 1 - - @property - def health_score(self) -> float: - """Calculate health score (0-1) based on metrics.""" - if not self.is_available(): - return 0.0 - - # Base score from success rate - if self.total_requests > 0: - success_rate = 1 - (self.total_failures / self.total_requests) - else: - success_rate = 1.0 - - # Latency factor (faster = higher score) - # Normalize: 100ms = 1.0, 1000ms = 0.1 - if self.avg_latency_ms > 0: - latency_factor = min(1.0, 100 / self.avg_latency_ms) - else: - latency_factor = 1.0 - - # Availability factor (less concurrent = more available) - if self.config.max_concurrent > 0: - availability = 1 - (self.active_requests / self.config.max_concurrent) - else: - availability = 1.0 - - # Combined score with weights - return (success_rate * 0.4 + latency_factor * 0.3 + availability * 0.3) * self.config.weight - - -class RotationalEmbedder(BaseEmbedder): - """Embedder that load balances across multiple API endpoints. 
- - Features: - - Intelligent endpoint selection based on latency and health - - Automatic failover on rate limits (429) and server errors - - Cooldown management to respect rate limits - - Thread-safe concurrent request handling - - Args: - endpoints: List of endpoint configurations - strategy: Selection strategy (default: latency_aware) - default_cooldown: Default cooldown seconds for rate limits (default: 60) - max_retries: Maximum retry attempts across all endpoints (default: 3) - """ - - def __init__( - self, - endpoints: List[EndpointConfig], - strategy: SelectionStrategy = SelectionStrategy.LATENCY_AWARE, - default_cooldown: float = 60.0, - max_retries: int = 3, - ) -> None: - if not endpoints: - raise ValueError("At least one endpoint must be provided") - - self.strategy = strategy - self.default_cooldown = default_cooldown - self.max_retries = max_retries - - # Initialize endpoint states - self._endpoints: List[EndpointState] = [] - self._lock = threading.Lock() - self._round_robin_index = 0 - - # Create embedder instances for each endpoint - from .litellm_embedder import LiteLLMEmbedderWrapper - - for config in endpoints: - # Build kwargs for LiteLLMEmbedderWrapper - kwargs: Dict[str, Any] = {} - if config.api_key: - kwargs["api_key"] = config.api_key - if config.api_base: - kwargs["api_base"] = config.api_base - - try: - embedder = LiteLLMEmbedderWrapper(model=config.model, **kwargs) - state = EndpointState(config=config, embedder=embedder) - self._endpoints.append(state) - logger.info(f"Initialized endpoint: {config.model}") - except Exception as e: - logger.error(f"Failed to initialize endpoint {config.model}: {e}") - - if not self._endpoints: - raise ValueError("Failed to initialize any endpoints") - - # Cache embedding properties from first endpoint - self._embedding_dim = self._endpoints[0].embedder.embedding_dim - self._model_name = f"rotational({len(self._endpoints)} endpoints)" - self._max_tokens = self._endpoints[0].embedder.max_tokens - - 
@property - def embedding_dim(self) -> int: - """Return embedding dimensions.""" - return self._embedding_dim - - @property - def model_name(self) -> str: - """Return model name.""" - return self._model_name - - @property - def max_tokens(self) -> int: - """Return maximum token limit.""" - return self._max_tokens - - @property - def endpoint_count(self) -> int: - """Return number of configured endpoints.""" - return len(self._endpoints) - - @property - def available_endpoint_count(self) -> int: - """Return number of available endpoints.""" - return sum(1 for ep in self._endpoints if ep.is_available()) - - def get_endpoint_stats(self) -> List[Dict[str, Any]]: - """Get statistics for all endpoints.""" - stats = [] - for ep in self._endpoints: - stats.append({ - "model": ep.config.model, - "status": ep.status.value, - "total_requests": ep.total_requests, - "total_failures": ep.total_failures, - "avg_latency_ms": round(ep.avg_latency_ms, 2), - "health_score": round(ep.health_score, 3), - "active_requests": ep.active_requests, - }) - return stats - - def _select_endpoint(self) -> Optional[EndpointState]: - """Select best available endpoint based on strategy.""" - available = [ep for ep in self._endpoints if ep.is_available()] - - if not available: - return None - - if self.strategy == SelectionStrategy.ROUND_ROBIN: - with self._lock: - self._round_robin_index = (self._round_robin_index + 1) % len(available) - return available[self._round_robin_index] - - elif self.strategy == SelectionStrategy.LATENCY_AWARE: - # Sort by health score (descending) and pick top candidate - # Add small random factor to prevent thundering herd - scored = [(ep, ep.health_score + random.uniform(0, 0.1)) for ep in available] - scored.sort(key=lambda x: x[1], reverse=True) - return scored[0][0] - - elif self.strategy == SelectionStrategy.WEIGHTED_RANDOM: - # Weighted random selection based on health scores - scores = [ep.health_score for ep in available] - total = sum(scores) - if total == 0: - 
return random.choice(available) - - weights = [s / total for s in scores] - return random.choices(available, weights=weights, k=1)[0] - - return available[0] - - def _parse_retry_after(self, error: Exception) -> Optional[float]: - """Extract Retry-After value from error if available.""" - error_str = str(error) - - # Try to find Retry-After in error message - import re - match = re.search(r'[Rr]etry[- ][Aa]fter[:\s]+(\d+)', error_str) - if match: - return float(match.group(1)) - - return None - - def _is_rate_limit_error(self, error: Exception) -> bool: - """Check if error is a rate limit error.""" - error_str = str(error).lower() - return any(x in error_str for x in ["429", "rate limit", "too many requests"]) - - def _is_retryable_error(self, error: Exception) -> bool: - """Check if error is retryable (not auth/config error).""" - error_str = str(error).lower() - # Retryable errors - if any(x in error_str for x in ["429", "rate limit", "502", "503", "504", - "timeout", "connection", "service unavailable"]): - return True - # Non-retryable errors (auth, config) - if any(x in error_str for x in ["401", "403", "invalid", "authentication", - "unauthorized", "api key"]): - return False - # Default to retryable for unknown errors - return True - - def embed_to_numpy(self, texts: str | Iterable[str], **kwargs) -> np.ndarray: - """Embed texts using load-balanced endpoint selection. - - Args: - texts: Single text or iterable of texts to embed. - **kwargs: Additional arguments passed to underlying embedder. - - Returns: - numpy.ndarray: Array of shape (n_texts, embedding_dim) containing embeddings. - - Raises: - RuntimeError: If all endpoints fail after retries. 
- """ - if isinstance(texts, str): - texts = [texts] - else: - texts = list(texts) - - last_error: Optional[Exception] = None - tried_endpoints: set = set() - - for attempt in range(self.max_retries + 1): - endpoint = self._select_endpoint() - - if endpoint is None: - # All endpoints unavailable, wait for shortest cooldown - min_cooldown = min( - (ep.cooldown_until - time.time() for ep in self._endpoints - if ep.status == EndpointStatus.COOLING), - default=self.default_cooldown - ) - if min_cooldown > 0 and attempt < self.max_retries: - wait_time = min(min_cooldown, 30) # Cap wait at 30s - logger.warning(f"All endpoints busy, waiting {wait_time:.1f}s...") - time.sleep(wait_time) - continue - break - - # Track tried endpoints to avoid infinite loops - endpoint_id = id(endpoint) - if endpoint_id in tried_endpoints and len(tried_endpoints) >= len(self._endpoints): - # Already tried all endpoints - break - tried_endpoints.add(endpoint_id) - - # Acquire slot - with endpoint.lock: - endpoint.active_requests += 1 - - try: - start_time = time.time() - result = endpoint.embedder.embed_to_numpy(texts, **kwargs) - latency_ms = (time.time() - start_time) * 1000 - - # Record success - endpoint.record_success(latency_ms) - - return result - - except Exception as e: - last_error = e - endpoint.record_failure() - - if self._is_rate_limit_error(e): - # Rate limited - set cooldown - retry_after = self._parse_retry_after(e) or self.default_cooldown - endpoint.set_cooldown(retry_after) - logger.warning(f"Endpoint {endpoint.config.model} rate limited, " - f"cooling for {retry_after}s") - - elif not self._is_retryable_error(e): - # Permanent failure (auth error, etc.) 
- endpoint.mark_failed() - logger.error(f"Endpoint {endpoint.config.model} failed permanently: {e}") - - else: - # Temporary error - short cooldown - endpoint.set_cooldown(5.0) - logger.warning(f"Endpoint {endpoint.config.model} error: {e}") - - finally: - with endpoint.lock: - endpoint.active_requests -= 1 - - # All retries exhausted - available = self.available_endpoint_count - raise RuntimeError( - f"All embedding attempts failed after {self.max_retries + 1} tries. " - f"Available endpoints: {available}/{len(self._endpoints)}. " - f"Last error: {last_error}" - ) - - -def create_rotational_embedder( - endpoints_config: List[Dict[str, Any]], - strategy: str = "latency_aware", - default_cooldown: float = 60.0, -) -> RotationalEmbedder: - """Factory function to create RotationalEmbedder from config dicts. - - Args: - endpoints_config: List of endpoint configuration dicts with keys: - - model: Model identifier (required) - - api_key: API key (optional) - - api_base: API base URL (optional) - - weight: Request weight (optional, default 1.0) - - max_concurrent: Max concurrent requests (optional, default 4) - strategy: Selection strategy name (round_robin, latency_aware, weighted_random) - default_cooldown: Default cooldown seconds for rate limits - - Returns: - Configured RotationalEmbedder instance - - Example config: - endpoints_config = [ - {"model": "openai/text-embedding-3-small", "api_key": "sk-..."}, - {"model": "azure/my-embedding", "api_base": "https://...", "api_key": "..."}, - ] - """ - endpoints = [] - for cfg in endpoints_config: - endpoints.append(EndpointConfig( - model=cfg["model"], - api_key=cfg.get("api_key"), - api_base=cfg.get("api_base"), - weight=cfg.get("weight", 1.0), - max_concurrent=cfg.get("max_concurrent", 4), - )) - - strategy_enum = SelectionStrategy[strategy.upper()] - - return RotationalEmbedder( - endpoints=endpoints, - strategy=strategy_enum, - default_cooldown=default_cooldown, - ) diff --git 
a/codex-lens/build/lib/codexlens/semantic/splade_encoder.py b/codex-lens/build/lib/codexlens/semantic/splade_encoder.py deleted file mode 100644 index de92c69d..00000000 --- a/codex-lens/build/lib/codexlens/semantic/splade_encoder.py +++ /dev/null @@ -1,567 +0,0 @@ -"""ONNX-optimized SPLADE sparse encoder for code search. - -This module provides SPLADE (Sparse Lexical and Expansion) encoding using ONNX Runtime -for efficient sparse vector generation. SPLADE produces vocabulary-aligned sparse vectors -that combine the interpretability of BM25 with neural relevance modeling. - -Install (CPU): - pip install onnxruntime optimum[onnxruntime] transformers - -Install (GPU): - pip install onnxruntime-gpu optimum[onnxruntime-gpu] transformers -""" - -from __future__ import annotations - -import logging -import threading -from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple - -logger = logging.getLogger(__name__) - - -def check_splade_available() -> Tuple[bool, Optional[str]]: - """Check whether SPLADE dependencies are available. - - Returns: - Tuple of (available: bool, error_message: Optional[str]) - """ - try: - import numpy # noqa: F401 - except ImportError as exc: - return False, f"numpy not available: {exc}. Install with: pip install numpy" - - try: - import onnxruntime # noqa: F401 - except ImportError as exc: - return ( - False, - f"onnxruntime not available: {exc}. Install with: pip install onnxruntime", - ) - - try: - from optimum.onnxruntime import ORTModelForMaskedLM # noqa: F401 - except ImportError as exc: - return ( - False, - f"optimum[onnxruntime] not available: {exc}. Install with: pip install optimum[onnxruntime]", - ) - - try: - from transformers import AutoTokenizer # noqa: F401 - except ImportError as exc: - return ( - False, - f"transformers not available: {exc}. 
Install with: pip install transformers", - ) - - return True, None - - -# Global cache for SPLADE encoders (singleton pattern) -_splade_cache: Dict[str, "SpladeEncoder"] = {} -_cache_lock = threading.RLock() - - -def get_splade_encoder( - model_name: str = "naver/splade-cocondenser-ensembledistil", - use_gpu: bool = True, - max_length: int = 512, - sparsity_threshold: float = 0.01, - cache_dir: Optional[str] = None, -) -> "SpladeEncoder": - """Get or create cached SPLADE encoder (thread-safe singleton). - - This function provides significant performance improvement by reusing - SpladeEncoder instances across multiple searches, avoiding repeated model - loading overhead. - - Args: - model_name: SPLADE model name (default: naver/splade-cocondenser-ensembledistil) - use_gpu: If True, use GPU acceleration when available - max_length: Maximum sequence length for tokenization - sparsity_threshold: Minimum weight to include in sparse vector - cache_dir: Directory to cache ONNX models (default: ~/.cache/codexlens/splade) - - Returns: - Cached SpladeEncoder instance for the given configuration - """ - global _splade_cache - - # Cache key includes all configuration parameters - cache_key = f"{model_name}:{'gpu' if use_gpu else 'cpu'}:{max_length}:{sparsity_threshold}" - - with _cache_lock: - encoder = _splade_cache.get(cache_key) - if encoder is not None: - return encoder - - # Create new encoder and cache it - encoder = SpladeEncoder( - model_name=model_name, - use_gpu=use_gpu, - max_length=max_length, - sparsity_threshold=sparsity_threshold, - cache_dir=cache_dir, - ) - # Pre-load model to ensure it's ready - encoder._load_model() - _splade_cache[cache_key] = encoder - - return encoder - - -def clear_splade_cache() -> None: - """Clear the SPLADE encoder cache and release ONNX resources. - - This method ensures proper cleanup of ONNX model resources to prevent - memory leaks when encoders are no longer needed. 
- """ - global _splade_cache - with _cache_lock: - # Release ONNX resources before clearing cache - for encoder in _splade_cache.values(): - if encoder._model is not None: - del encoder._model - encoder._model = None - if encoder._tokenizer is not None: - del encoder._tokenizer - encoder._tokenizer = None - _splade_cache.clear() - - -class SpladeEncoder: - """ONNX-optimized SPLADE sparse encoder. - - Produces sparse vectors with vocabulary-aligned dimensions. - Output: Dict[int, float] mapping token_id to weight. - - SPLADE activation formula: - splade_repr = log(1 + ReLU(logits)) * attention_mask - splade_vec = max_pooling(splade_repr, axis=sequence_length) - - References: - - SPLADE: https://arxiv.org/abs/2107.05720 - - SPLADE v2: https://arxiv.org/abs/2109.10086 - """ - - DEFAULT_MODEL = "naver/splade-cocondenser-ensembledistil" - - def __init__( - self, - model_name: str = DEFAULT_MODEL, - use_gpu: bool = True, - max_length: int = 512, - sparsity_threshold: float = 0.01, - providers: Optional[List[Any]] = None, - cache_dir: Optional[str] = None, - ) -> None: - """Initialize SPLADE encoder. 
- - Args: - model_name: SPLADE model name (default: naver/splade-cocondenser-ensembledistil) - use_gpu: If True, use GPU acceleration when available - max_length: Maximum sequence length for tokenization - sparsity_threshold: Minimum weight to include in sparse vector - providers: Explicit ONNX providers list (overrides use_gpu) - cache_dir: Directory to cache ONNX models (default: ~/.cache/codexlens/splade) - """ - self.model_name = (model_name or self.DEFAULT_MODEL).strip() - if not self.model_name: - raise ValueError("model_name cannot be blank") - - self.use_gpu = bool(use_gpu) - self.max_length = int(max_length) if max_length > 0 else 512 - self.sparsity_threshold = float(sparsity_threshold) - self.providers = providers - - # Setup ONNX cache directory - if cache_dir: - self._cache_dir = Path(cache_dir) - else: - self._cache_dir = Path.home() / ".cache" / "codexlens" / "splade" - - self._tokenizer: Any | None = None - self._model: Any | None = None - self._vocab_size: int | None = None - self._lock = threading.RLock() - - def _get_local_cache_path(self) -> Path: - """Get local cache path for this model's ONNX files. - - Returns: - Path to the local ONNX cache directory for this model - """ - # Replace / with -- for filesystem-safe naming - safe_name = self.model_name.replace("/", "--") - return self._cache_dir / safe_name - - def _load_model(self) -> None: - """Lazy load ONNX model and tokenizer. - - First checks local cache for ONNX model, falling back to - HuggingFace download and conversion if not cached. 
- """ - if self._model is not None and self._tokenizer is not None: - return - - ok, err = check_splade_available() - if not ok: - raise ImportError(err) - - with self._lock: - if self._model is not None and self._tokenizer is not None: - return - - from inspect import signature - - from optimum.onnxruntime import ORTModelForMaskedLM - from transformers import AutoTokenizer - - if self.providers is None: - from .gpu_support import get_optimal_providers, get_selected_device_id - - # Get providers as pure string list (cache-friendly) - # NOTE: with_device_options=False to avoid tuple-based providers - # which break optimum's caching mechanism - self.providers = get_optimal_providers( - use_gpu=self.use_gpu, with_device_options=False - ) - # Get device_id separately for provider_options - self._device_id = get_selected_device_id() if self.use_gpu else None - - # Some Optimum versions accept `providers`, others accept a single `provider` - # Prefer passing the full providers list, with a conservative fallback - model_kwargs: dict[str, Any] = {} - try: - params = signature(ORTModelForMaskedLM.from_pretrained).parameters - if "providers" in params: - model_kwargs["providers"] = self.providers - # Pass device_id via provider_options for GPU selection - if "provider_options" in params and hasattr(self, '_device_id') and self._device_id is not None: - # Build provider_options dict for each GPU provider - provider_options = {} - for p in self.providers: - if p in ("DmlExecutionProvider", "CUDAExecutionProvider", "ROCMExecutionProvider"): - provider_options[p] = {"device_id": self._device_id} - if provider_options: - model_kwargs["provider_options"] = provider_options - elif "provider" in params: - provider_name = "CPUExecutionProvider" - if self.providers: - first = self.providers[0] - provider_name = first[0] if isinstance(first, tuple) else str(first) - model_kwargs["provider"] = provider_name - except Exception as e: - logger.debug(f"Failed to inspect ORTModel signature: 
{e}") - model_kwargs = {} - - # Check for local ONNX cache first - local_cache = self._get_local_cache_path() - onnx_model_path = local_cache / "model.onnx" - - if onnx_model_path.exists(): - # Load from local cache - logger.info(f"Loading SPLADE from local cache: {local_cache}") - try: - self._model = ORTModelForMaskedLM.from_pretrained( - str(local_cache), - **model_kwargs, - ) - self._tokenizer = AutoTokenizer.from_pretrained( - str(local_cache), use_fast=True - ) - self._vocab_size = len(self._tokenizer) - logger.info( - f"SPLADE loaded from cache: {self.model_name}, vocab={self._vocab_size}" - ) - return - except Exception as e: - logger.warning(f"Failed to load from cache, redownloading: {e}") - - # Download and convert from HuggingFace - logger.info(f"Downloading SPLADE model: {self.model_name}") - try: - self._model = ORTModelForMaskedLM.from_pretrained( - self.model_name, - export=True, # Export to ONNX - **model_kwargs, - ) - logger.debug(f"SPLADE model loaded: {self.model_name}") - except TypeError: - # Fallback for older Optimum versions: retry without provider arguments - self._model = ORTModelForMaskedLM.from_pretrained( - self.model_name, - export=True, - ) - logger.warning( - "Optimum version doesn't support provider parameters. 
" - "Upgrade optimum for GPU acceleration: pip install --upgrade optimum" - ) - - self._tokenizer = AutoTokenizer.from_pretrained(self.model_name, use_fast=True) - - # Cache vocabulary size - self._vocab_size = len(self._tokenizer) - logger.debug(f"SPLADE tokenizer loaded: vocab_size={self._vocab_size}") - - # Save to local cache for future use - try: - local_cache.mkdir(parents=True, exist_ok=True) - self._model.save_pretrained(str(local_cache)) - self._tokenizer.save_pretrained(str(local_cache)) - logger.info(f"SPLADE model cached to: {local_cache}") - except Exception as e: - logger.warning(f"Failed to cache SPLADE model: {e}") - - @staticmethod - def _splade_activation(logits: Any, attention_mask: Any) -> Any: - """Apply SPLADE activation function to model outputs. - - Formula: log(1 + ReLU(logits)) * attention_mask - - Args: - logits: Model output logits (batch, seq_len, vocab_size) - attention_mask: Attention mask (batch, seq_len) - - Returns: - SPLADE representations (batch, seq_len, vocab_size) - """ - import numpy as np - - # ReLU activation - relu_logits = np.maximum(0, logits) - - # Log(1 + x) transformation - log_relu = np.log1p(relu_logits) - - # Apply attention mask (expand to match vocab dimension) - # attention_mask: (batch, seq_len) -> (batch, seq_len, 1) - mask_expanded = np.expand_dims(attention_mask, axis=-1) - - # Element-wise multiplication - splade_repr = log_relu * mask_expanded - - return splade_repr - - @staticmethod - def _max_pooling(splade_repr: Any) -> Any: - """Max pooling over sequence length dimension. - - Args: - splade_repr: SPLADE representations (batch, seq_len, vocab_size) - - Returns: - Pooled sparse vectors (batch, vocab_size) - """ - import numpy as np - - # Max pooling over sequence dimension (axis=1) - return np.max(splade_repr, axis=1) - - def _to_sparse_dict(self, dense_vec: Any) -> Dict[int, float]: - """Convert dense vector to sparse dictionary. 
- - Args: - dense_vec: Dense vector (vocab_size,) - - Returns: - Sparse dictionary {token_id: weight} with weights above threshold - """ - import numpy as np - - # Find non-zero indices above threshold - nonzero_indices = np.where(dense_vec > self.sparsity_threshold)[0] - - # Create sparse dictionary - sparse_dict = { - int(idx): float(dense_vec[idx]) - for idx in nonzero_indices - } - - return sparse_dict - - def warmup(self, text: str = "warmup query") -> None: - """Warmup the encoder by running a dummy inference. - - First-time model inference includes initialization overhead. - Call this method once before the first real search to avoid - latency spikes. - - Args: - text: Dummy text for warmup (default: "warmup query") - """ - logger.info("Warming up SPLADE encoder...") - # Trigger model loading and first inference - _ = self.encode_text(text) - logger.info("SPLADE encoder warmup complete") - - def encode_text(self, text: str) -> Dict[int, float]: - """Encode text to sparse vector {token_id: weight}. 
- - Args: - text: Input text to encode - - Returns: - Sparse vector as dictionary mapping token_id to weight - """ - self._load_model() - - if self._model is None or self._tokenizer is None: - raise RuntimeError("Model not loaded") - - import numpy as np - - # Tokenize input - encoded = self._tokenizer( - text, - padding=True, - truncation=True, - max_length=self.max_length, - return_tensors="np", - ) - - # Forward pass through model - outputs = self._model(**encoded) - - # Extract logits - if hasattr(outputs, "logits"): - logits = outputs.logits - elif isinstance(outputs, dict) and "logits" in outputs: - logits = outputs["logits"] - elif isinstance(outputs, (list, tuple)) and outputs: - logits = outputs[0] - else: - raise RuntimeError("Unexpected model output format") - - # Apply SPLADE activation - attention_mask = encoded["attention_mask"] - splade_repr = self._splade_activation(logits, attention_mask) - - # Max pooling over sequence length - splade_vec = self._max_pooling(splade_repr) - - # Convert to sparse dictionary (single item batch) - sparse_dict = self._to_sparse_dict(splade_vec[0]) - - return sparse_dict - - def encode_batch(self, texts: List[str], batch_size: int = 32) -> List[Dict[int, float]]: - """Batch encode texts to sparse vectors. 
- - Args: - texts: List of input texts to encode - batch_size: Batch size for encoding (default: 32) - - Returns: - List of sparse vectors as dictionaries - """ - if not texts: - return [] - - self._load_model() - - if self._model is None or self._tokenizer is None: - raise RuntimeError("Model not loaded") - - import numpy as np - - results: List[Dict[int, float]] = [] - - # Process in batches - for i in range(0, len(texts), batch_size): - batch_texts = texts[i:i + batch_size] - - # Tokenize batch - encoded = self._tokenizer( - batch_texts, - padding=True, - truncation=True, - max_length=self.max_length, - return_tensors="np", - ) - - # Forward pass through model - outputs = self._model(**encoded) - - # Extract logits - if hasattr(outputs, "logits"): - logits = outputs.logits - elif isinstance(outputs, dict) and "logits" in outputs: - logits = outputs["logits"] - elif isinstance(outputs, (list, tuple)) and outputs: - logits = outputs[0] - else: - raise RuntimeError("Unexpected model output format") - - # Apply SPLADE activation - attention_mask = encoded["attention_mask"] - splade_repr = self._splade_activation(logits, attention_mask) - - # Max pooling over sequence length - splade_vecs = self._max_pooling(splade_repr) - - # Convert each vector to sparse dictionary - for vec in splade_vecs: - sparse_dict = self._to_sparse_dict(vec) - results.append(sparse_dict) - - return results - - @property - def vocab_size(self) -> int: - """Return vocabulary size (~30k for BERT-based models). - - Returns: - Vocabulary size (number of tokens in tokenizer) - """ - if self._vocab_size is not None: - return self._vocab_size - - self._load_model() - return self._vocab_size or 0 - - def get_token(self, token_id: int) -> str: - """Convert token_id to string (for debugging). 
- - Args: - token_id: Token ID to convert - - Returns: - Token string - """ - self._load_model() - - if self._tokenizer is None: - raise RuntimeError("Tokenizer not loaded") - - return self._tokenizer.decode([token_id]) - - def get_top_tokens(self, sparse_vec: Dict[int, float], top_k: int = 10) -> List[Tuple[str, float]]: - """Get top-k tokens with highest weights from sparse vector. - - Useful for debugging and understanding what the model is focusing on. - - Args: - sparse_vec: Sparse vector as {token_id: weight} - top_k: Number of top tokens to return - - Returns: - List of (token_string, weight) tuples, sorted by weight descending - """ - self._load_model() - - if not sparse_vec: - return [] - - # Sort by weight descending - sorted_items = sorted(sparse_vec.items(), key=lambda x: x[1], reverse=True) - - # Take top-k and convert token_ids to strings - top_items = sorted_items[:top_k] - - return [ - (self.get_token(token_id), weight) - for token_id, weight in top_items - ] diff --git a/codex-lens/build/lib/codexlens/semantic/vector_store.py b/codex-lens/build/lib/codexlens/semantic/vector_store.py deleted file mode 100644 index 1dad8fbe..00000000 --- a/codex-lens/build/lib/codexlens/semantic/vector_store.py +++ /dev/null @@ -1,1278 +0,0 @@ -"""Vector storage and similarity search for semantic chunks. 
- -Optimized for high-performance similarity search using: -- HNSW index for O(log N) approximate nearest neighbor search (primary) -- Cached embedding matrix for batch operations (fallback) -- NumPy vectorized cosine similarity (fallback, 100x+ faster than loops) -- Lazy content loading (only fetch for top-k results) -""" - -from __future__ import annotations - -import json -import logging -import sys -import sqlite3 -import threading -from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple - -from codexlens.entities import SearchResult, SemanticChunk -from codexlens.errors import StorageError - -try: - import numpy as np - NUMPY_AVAILABLE = True -except ImportError: - np = None # type: ignore[assignment] - NUMPY_AVAILABLE = False - -# Try to import ANN index (optional hnswlib dependency) -try: - from codexlens.semantic.ann_index import ( - ANNIndex, - BinaryANNIndex, - create_ann_index, - HNSWLIB_AVAILABLE, - ) -except ImportError: - HNSWLIB_AVAILABLE = False - ANNIndex = None - BinaryANNIndex = None - create_ann_index = None - - -logger = logging.getLogger(__name__) - -# Epsilon used to guard against floating point precision edge cases (e.g., near-zero norms). -EPSILON = 1e-10 - -# SQLite INTEGER PRIMARY KEY uses signed 64-bit rowids. -SQLITE_INTEGER_MAX = (1 << 63) - 1 - - -def _validate_chunk_id_range(start_id: int, count: int) -> None: - """Validate that a batch insert can safely generate sequential chunk IDs.""" - if count <= 0: - return - - last_id = start_id + count - 1 - if last_id > sys.maxsize or last_id > SQLITE_INTEGER_MAX: - raise ValueError( - "Chunk ID range overflow: " - f"start_id={start_id}, count={count} would allocate up to {last_id}, " - f"exceeding limits (sys.maxsize={sys.maxsize}, sqlite_max={SQLITE_INTEGER_MAX}). " - "Consider cleaning up the index database or creating a new index database." 
- ) - - -def _validate_sql_placeholders(placeholders: str, expected_count: int) -> None: - """Validate the placeholder string used for a parameterized SQL IN clause.""" - expected = ",".join("?" * expected_count) - if placeholders != expected: - raise ValueError( - "Invalid SQL placeholders for IN clause. " - f"Expected {expected_count} '?' placeholders." - ) - - -def _cosine_similarity(a: List[float], b: List[float]) -> float: - """Compute cosine similarity between two vectors.""" - if not NUMPY_AVAILABLE: - raise ImportError("numpy required for vector operations") - - a_arr = np.array(a) - b_arr = np.array(b) - - norm_a = np.linalg.norm(a_arr) - norm_b = np.linalg.norm(b_arr) - - # Use epsilon tolerance to avoid division by (near-)zero due to floating point precision. - if norm_a < EPSILON or norm_b < EPSILON: - return 0.0 - - denom = norm_a * norm_b - if denom < EPSILON: - return 0.0 - - return float(np.dot(a_arr, b_arr) / denom) - - -class VectorStore: - """SQLite-based vector storage with HNSW-accelerated similarity search. - - Performance optimizations: - - HNSW index for O(log N) approximate nearest neighbor search - - Embedding matrix cached in memory for batch similarity computation (fallback) - - NumPy vectorized operations instead of Python loops (fallback) - - Lazy content loading - only fetch full content for top-k results - - Thread-safe cache invalidation - - Bulk insert mode for efficient batch operations - """ - - # Default embedding dimension (used when creating new index) - DEFAULT_DIM = 768 - - def __init__(self, db_path: str | Path) -> None: - if not NUMPY_AVAILABLE: - raise ImportError( - "Semantic search dependencies not available. 
" - "Install with: pip install codexlens[semantic]" - ) - - self.db_path = Path(db_path) - self.db_path.parent.mkdir(parents=True, exist_ok=True) - - # Embedding cache for fast similarity search (fallback) - self._cache_lock = threading.RLock() - self._embedding_matrix: Optional[np.ndarray] = None - self._embedding_norms: Optional[np.ndarray] = None - self._chunk_ids: Optional[List[int]] = None - self._cache_version: int = 0 - - # ANN index for O(log N) search - self._ann_index: Optional[ANNIndex] = None - self._ann_dim: Optional[int] = None - self._ann_write_lock = threading.Lock() # Protects ANN index modifications - - # Bulk insert mode tracking - self._bulk_insert_mode: bool = False - self._bulk_insert_ids: List[int] = [] - self._bulk_insert_embeddings: List[np.ndarray] = [] - - self._init_schema() - self._init_ann_index() - - def _init_schema(self) -> None: - """Initialize vector storage schema.""" - with sqlite3.connect(self.db_path) as conn: - # Enable memory mapping for faster reads - conn.execute("PRAGMA mmap_size = 30000000000") # 30GB limit - conn.execute(""" - CREATE TABLE IF NOT EXISTS semantic_chunks ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - file_path TEXT NOT NULL, - content TEXT NOT NULL, - embedding BLOB NOT NULL, - metadata TEXT, - category TEXT DEFAULT 'code', - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP - ) - """) - conn.execute(""" - CREATE INDEX IF NOT EXISTS idx_chunks_file - ON semantic_chunks(file_path) - """) - conn.execute(""" - CREATE INDEX IF NOT EXISTS idx_chunks_category - ON semantic_chunks(category) - """) - # Model configuration table - tracks which model generated the embeddings - conn.execute(""" - CREATE TABLE IF NOT EXISTS embeddings_config ( - id INTEGER PRIMARY KEY CHECK (id = 1), - model_profile TEXT NOT NULL, - model_name TEXT NOT NULL, - embedding_dim INTEGER NOT NULL, - backend TEXT NOT NULL DEFAULT 'fastembed', - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP - ) - 
""") - - # Migration: Add backend column to existing tables - self._migrate_backend_column(conn) - # Migration: Add category column - self._migrate_category_column(conn) - - conn.commit() - - def _migrate_backend_column(self, conn: sqlite3.Connection) -> None: - """Add backend column to existing embeddings_config table if not present. - - Args: - conn: Active SQLite connection - """ - # Check if backend column exists - cursor = conn.execute("PRAGMA table_info(embeddings_config)") - columns = [row[1] for row in cursor.fetchall()] - - if 'backend' not in columns: - logger.info("Migrating embeddings_config table: adding backend column") - conn.execute(""" - ALTER TABLE embeddings_config - ADD COLUMN backend TEXT NOT NULL DEFAULT 'fastembed' - """) - - def _migrate_category_column(self, conn: sqlite3.Connection) -> None: - """Add category column to existing semantic_chunks table if not present. - - Args: - conn: Active SQLite connection - """ - # Check if category column exists - cursor = conn.execute("PRAGMA table_info(semantic_chunks)") - columns = [row[1] for row in cursor.fetchall()] - - if 'category' not in columns: - logger.info("Migrating semantic_chunks table: adding category column") - conn.execute(""" - ALTER TABLE semantic_chunks - ADD COLUMN category TEXT DEFAULT 'code' - """) - # Create index for fast category filtering - conn.execute(""" - CREATE INDEX IF NOT EXISTS idx_chunks_category - ON semantic_chunks(category) - """) - - def _init_ann_index(self) -> None: - """Initialize ANN index (lazy loading from existing data).""" - if not HNSWLIB_AVAILABLE: - logger.debug("hnswlib not available, using brute-force search") - return - - # Try to detect embedding dimension from existing data - dim = self._detect_embedding_dim() - if dim is None: - # No data yet, will initialize on first add - logger.debug("No embeddings found, ANN index will be created on first add") - return - - self._ann_dim = dim - - try: - self._ann_index = ANNIndex(self.db_path, dim) - if 
self._ann_index.load(): - logger.debug( - "Loaded ANN index with %d vectors", self._ann_index.count() - ) - else: - # Index file doesn't exist, try to build from SQLite data - logger.debug("ANN index file not found, rebuilding from SQLite") - self._rebuild_ann_index_internal() - except Exception as e: - logger.warning("Failed to initialize ANN index: %s", e) - self._ann_index = None - - def _detect_embedding_dim(self) -> Optional[int]: - """Detect embedding dimension from existing data.""" - with sqlite3.connect(self.db_path) as conn: - row = conn.execute( - "SELECT embedding FROM semantic_chunks LIMIT 1" - ).fetchone() - if row and row[0]: - # Embedding is stored as float32 blob - blob = row[0] - return len(blob) // np.dtype(np.float32).itemsize - return None - - @property - def dimension(self) -> Optional[int]: - """Return the dimension of embeddings in the store. - - Returns: - Embedding dimension if available, None if store is empty. - """ - if self._ann_dim is not None: - return self._ann_dim - self._ann_dim = self._detect_embedding_dim() - return self._ann_dim - - def _rebuild_ann_index_internal(self) -> int: - """Internal method to rebuild ANN index from SQLite data.""" - if self._ann_index is None: - return 0 - - with sqlite3.connect(self.db_path) as conn: - conn.execute("PRAGMA mmap_size = 30000000000") - rows = conn.execute( - "SELECT id, embedding FROM semantic_chunks" - ).fetchall() - - if not rows: - return 0 - - # Extract IDs and embeddings - ids = [r[0] for r in rows] - embeddings = np.vstack([ - np.frombuffer(r[1], dtype=np.float32) for r in rows - ]) - - # Add to ANN index - self._ann_index.add_vectors(ids, embeddings) - self._ann_index.save() - - logger.info("Rebuilt ANN index with %d vectors", len(ids)) - return len(ids) - - def rebuild_ann_index(self) -> int: - """Rebuild HNSW index from all chunks in SQLite. 
- - Use this method to: - - Migrate existing data to use ANN search - - Repair corrupted index - - Reclaim space after many deletions - - Returns: - Number of vectors indexed. - """ - if not HNSWLIB_AVAILABLE: - logger.warning("hnswlib not available, cannot rebuild ANN index") - return 0 - - # Detect dimension - dim = self._detect_embedding_dim() - if dim is None: - logger.warning("No embeddings found, cannot rebuild ANN index") - return 0 - - self._ann_dim = dim - - # Create new index - try: - self._ann_index = ANNIndex(self.db_path, dim) - return self._rebuild_ann_index_internal() - except Exception as e: - logger.error("Failed to rebuild ANN index: %s", e) - self._ann_index = None - return 0 - - def _invalidate_cache(self) -> None: - """Invalidate the embedding cache (thread-safe).""" - with self._cache_lock: - self._embedding_matrix = None - self._embedding_norms = None - self._chunk_ids = None - self._cache_version += 1 - - def _refresh_cache(self) -> bool: - """Load embeddings into numpy matrix for fast similarity search. - - Returns: - True if cache was refreshed successfully, False if no data. 
- """ - with self._cache_lock: - with sqlite3.connect(self.db_path) as conn: - conn.execute("PRAGMA mmap_size = 30000000000") - rows = conn.execute( - "SELECT id, embedding FROM semantic_chunks" - ).fetchall() - - if not rows: - self._embedding_matrix = None - self._embedding_norms = None - self._chunk_ids = None - return False - - # Extract IDs and embeddings - self._chunk_ids = [r[0] for r in rows] - - # Bulk convert binary blobs to numpy matrix - embeddings = [ - np.frombuffer(r[1], dtype=np.float32) for r in rows - ] - self._embedding_matrix = np.vstack(embeddings) - - # Pre-compute norms for faster similarity calculation - self._embedding_norms = np.linalg.norm( - self._embedding_matrix, axis=1, keepdims=True - ) - # Avoid division by zero - self._embedding_norms = np.where( - self._embedding_norms == 0, EPSILON, self._embedding_norms - ) - - return True - - def _ensure_ann_index(self, dim: int) -> bool: - """Ensure ANN index is initialized with correct dimension. - - This method is thread-safe and uses double-checked locking. - - Args: - dim: Embedding dimension - - Returns: - True if ANN index is ready, False otherwise - """ - if not HNSWLIB_AVAILABLE: - return False - - # Fast path: index already initialized (no lock needed) - if self._ann_index is not None: - return True - - # Slow path: acquire lock for initialization - with self._ann_write_lock: - # Double-check after acquiring lock - if self._ann_index is not None: - return True - - try: - self._ann_dim = dim - self._ann_index = ANNIndex(self.db_path, dim) - self._ann_index.load() # Try to load existing - return True - except Exception as e: - logger.warning("Failed to initialize ANN index: %s", e) - self._ann_index = None - return False - - def add_chunk( - self, chunk: SemanticChunk, file_path: str, category: str = "code" - ) -> int: - """Add a single chunk with its embedding. 
- - Args: - chunk: SemanticChunk with embedding - file_path: Path to the source file - category: File category ('code' or 'doc'), default 'code' - - Returns: - The inserted chunk ID. - """ - if chunk.embedding is None: - raise ValueError("Chunk must have embedding before adding to store") - - embedding_arr = np.array(chunk.embedding, dtype=np.float32) - embedding_blob = embedding_arr.tobytes() - metadata_json = json.dumps(chunk.metadata) if chunk.metadata else None - - with sqlite3.connect(self.db_path) as conn: - cursor = conn.execute( - """ - INSERT INTO semantic_chunks (file_path, content, embedding, metadata, category) - VALUES (?, ?, ?, ?, ?) - """, - (file_path, chunk.content, embedding_blob, metadata_json, category) - ) - conn.commit() - chunk_id = cursor.lastrowid or 0 - - # Add to ANN index - if self._ensure_ann_index(len(chunk.embedding)): - with self._ann_write_lock: - try: - self._ann_index.add_vectors([chunk_id], embedding_arr.reshape(1, -1)) - self._ann_index.save() - except Exception as e: - logger.warning("Failed to add to ANN index: %s", e) - - # Invalidate cache after modification - self._invalidate_cache() - return chunk_id - - def add_chunks( - self, chunks: List[SemanticChunk], file_path: str, category: str = "code" - ) -> List[int]: - """Add multiple chunks with embeddings (batch insert). - - Args: - chunks: List of SemanticChunk objects with embeddings - file_path: Path to the source file - category: File category ('code' or 'doc'), default 'code' - - Returns: - List of inserted chunk IDs. 
- """ - if not chunks: - return [] - - # Prepare batch data - batch_data = [] - embeddings_list = [] - for chunk in chunks: - if chunk.embedding is None: - raise ValueError("All chunks must have embeddings") - embedding_arr = np.array(chunk.embedding, dtype=np.float32) - embedding_blob = embedding_arr.tobytes() - metadata_json = json.dumps(chunk.metadata) if chunk.metadata else None - batch_data.append((file_path, chunk.content, embedding_blob, metadata_json, category)) - embeddings_list.append(embedding_arr) - - # Batch insert to SQLite - with sqlite3.connect(self.db_path) as conn: - # Get starting ID before insert - row = conn.execute("SELECT MAX(id) FROM semantic_chunks").fetchone() - start_id = (row[0] or 0) + 1 - - conn.executemany( - """ - INSERT INTO semantic_chunks (file_path, content, embedding, metadata, category) - VALUES (?, ?, ?, ?, ?) - """, - batch_data - ) - conn.commit() - # Calculate inserted IDs based on starting ID - ids = list(range(start_id, start_id + len(chunks))) - - # Add to ANN index - if embeddings_list and self._ensure_ann_index(len(embeddings_list[0])): - with self._ann_write_lock: - try: - embeddings_matrix = np.vstack(embeddings_list) - self._ann_index.add_vectors(ids, embeddings_matrix) - self._ann_index.save() - except Exception as e: - logger.warning("Failed to add batch to ANN index: %s", e) - - # Invalidate cache after modification - self._invalidate_cache() - return ids - - def add_chunks_batch( - self, - chunks_with_paths: List[Tuple[SemanticChunk, str]], - update_ann: bool = True, - auto_save_ann: bool = True, - categories: Optional[List[str]] = None, - ) -> List[int]: - """Batch insert chunks from multiple files in a single transaction. - - This method is optimized for bulk operations during index generation. - - Args: - chunks_with_paths: List of (chunk, file_path) tuples - update_ann: If True, update ANN index with new vectors (default: True) - auto_save_ann: If True, save ANN index after update (default: True). 
- Set to False for bulk inserts to reduce I/O overhead. - categories: Optional list of categories per chunk. If None, defaults to 'code'. - If provided, must match length of chunks_with_paths. - - Returns: - List of inserted chunk IDs - """ - if not chunks_with_paths: - return [] - - batch_size = len(chunks_with_paths) - - # Validate categories if provided - if categories is not None and len(categories) != batch_size: - raise ValueError( - f"categories length ({len(categories)}) must match " - f"chunks_with_paths length ({batch_size})" - ) - - # Prepare batch data - batch_data = [] - embeddings_list = [] - for i, (chunk, file_path) in enumerate(chunks_with_paths): - if chunk.embedding is None: - raise ValueError("All chunks must have embeddings") - # Optimize: avoid repeated np.array() if already numpy - if isinstance(chunk.embedding, np.ndarray): - embedding_arr = chunk.embedding.astype(np.float32) - else: - embedding_arr = np.array(chunk.embedding, dtype=np.float32) - embedding_blob = embedding_arr.tobytes() - metadata_json = json.dumps(chunk.metadata) if chunk.metadata else None - category = categories[i] if categories else "code" - batch_data.append((file_path, chunk.content, embedding_blob, metadata_json, category)) - embeddings_list.append(embedding_arr) - - # Batch insert to SQLite in single transaction - with sqlite3.connect(self.db_path) as conn: - # Get starting ID before insert - row = conn.execute("SELECT MAX(id) FROM semantic_chunks").fetchone() - start_id = (row[0] or 0) + 1 - - _validate_chunk_id_range(start_id, batch_size) - - conn.executemany( - """ - INSERT INTO semantic_chunks (file_path, content, embedding, metadata, category) - VALUES (?, ?, ?, ?, ?) 
- """, - batch_data - ) - conn.commit() - # Calculate inserted IDs based on starting ID - ids = list(range(start_id, start_id + batch_size)) - - # Handle ANN index updates - if embeddings_list and update_ann and self._ensure_ann_index(len(embeddings_list[0])): - with self._ann_write_lock: - # In bulk insert mode, accumulate for later batch update - if self._bulk_insert_mode: - self._bulk_insert_ids.extend(ids) - self._bulk_insert_embeddings.extend(embeddings_list) - else: - # Normal mode: update immediately - try: - embeddings_matrix = np.vstack(embeddings_list) - self._ann_index.add_vectors(ids, embeddings_matrix) - if auto_save_ann: - self._ann_index.save() - except Exception as e: - logger.warning("Failed to add batch to ANN index: %s", e) - - # Invalidate cache after modification - self._invalidate_cache() - return ids - - def add_chunks_batch_numpy( - self, - chunks_with_paths: List[Tuple[SemanticChunk, str]], - embeddings_matrix: np.ndarray, - update_ann: bool = True, - auto_save_ann: bool = True, - categories: Optional[List[str]] = None, - ) -> List[int]: - """Batch insert chunks with pre-computed numpy embeddings matrix. - - This method accepts embeddings as a numpy matrix to avoid list->array conversions. - Useful when embeddings are already in numpy format from batch encoding. - - Args: - chunks_with_paths: List of (chunk, file_path) tuples (embeddings can be None) - embeddings_matrix: Pre-computed embeddings as (N, D) numpy array - update_ann: If True, update ANN index with new vectors (default: True) - auto_save_ann: If True, save ANN index after update (default: True) - categories: Optional list of categories per chunk. If None, defaults to 'code'. 
- - Returns: - List of inserted chunk IDs - """ - if not chunks_with_paths: - return [] - - batch_size = len(chunks_with_paths) - - if len(chunks_with_paths) != embeddings_matrix.shape[0]: - raise ValueError( - f"Mismatch: {len(chunks_with_paths)} chunks but " - f"{embeddings_matrix.shape[0]} embeddings" - ) - - # Validate categories if provided - if categories is not None and len(categories) != batch_size: - raise ValueError( - f"categories length ({len(categories)}) must match " - f"chunks_with_paths length ({batch_size})" - ) - - # Ensure float32 format - embeddings_matrix = embeddings_matrix.astype(np.float32) - - # Prepare batch data - batch_data = [] - for i, (chunk, file_path) in enumerate(chunks_with_paths): - embedding_arr = embeddings_matrix[i] - embedding_blob = embedding_arr.tobytes() - metadata_json = json.dumps(chunk.metadata) if chunk.metadata else None - category = categories[i] if categories else "code" - batch_data.append((file_path, chunk.content, embedding_blob, metadata_json, category)) - - # Batch insert to SQLite in single transaction - with sqlite3.connect(self.db_path) as conn: - # Get starting ID before insert - row = conn.execute("SELECT MAX(id) FROM semantic_chunks").fetchone() - start_id = (row[0] or 0) + 1 - - _validate_chunk_id_range(start_id, batch_size) - - conn.executemany( - """ - INSERT INTO semantic_chunks (file_path, content, embedding, metadata, category) - VALUES (?, ?, ?, ?, ?) 
- """, - batch_data - ) - conn.commit() - # Calculate inserted IDs based on starting ID - ids = list(range(start_id, start_id + batch_size)) - - # Handle ANN index updates - if update_ann and self._ensure_ann_index(embeddings_matrix.shape[1]): - with self._ann_write_lock: - # In bulk insert mode, accumulate for later batch update - if self._bulk_insert_mode: - self._bulk_insert_ids.extend(ids) - # Split matrix into individual arrays for accumulation - self._bulk_insert_embeddings.extend([embeddings_matrix[i] for i in range(len(ids))]) - else: - # Normal mode: update immediately - try: - self._ann_index.add_vectors(ids, embeddings_matrix) - if auto_save_ann: - self._ann_index.save() - except Exception as e: - logger.warning("Failed to add batch to ANN index: %s", e) - - # Invalidate cache after modification - self._invalidate_cache() - return ids - - def begin_bulk_insert(self) -> None: - """Begin bulk insert mode - disable ANN auto-update for better performance. - - Usage: - store.begin_bulk_insert() - try: - for batch in batches: - store.add_chunks_batch(batch, auto_save_ann=False) - finally: - store.end_bulk_insert() - - Or use context manager: - with store.bulk_insert(): - for batch in batches: - store.add_chunks_batch(batch) - """ - with self._ann_write_lock: - self._bulk_insert_mode = True - self._bulk_insert_ids.clear() - self._bulk_insert_embeddings.clear() - logger.debug("Entered bulk insert mode") - - def end_bulk_insert(self) -> None: - """End bulk insert mode and rebuild ANN index from accumulated data. - - This method should be called after all bulk inserts are complete to - update the ANN index in a single batch operation. 
- """ - with self._ann_write_lock: - if not self._bulk_insert_mode: - logger.warning("end_bulk_insert called but not in bulk insert mode") - return - - self._bulk_insert_mode = False - bulk_ids = list(self._bulk_insert_ids) - bulk_embeddings = list(self._bulk_insert_embeddings) - self._bulk_insert_ids.clear() - self._bulk_insert_embeddings.clear() - - # Update ANN index with accumulated data. - if bulk_ids and bulk_embeddings: - if self._ensure_ann_index(len(bulk_embeddings[0])): - with self._ann_write_lock: - try: - embeddings_matrix = np.vstack(bulk_embeddings) - self._ann_index.add_vectors(bulk_ids, embeddings_matrix) - self._ann_index.save() - logger.info( - "Bulk insert complete: added %d vectors to ANN index", - len(bulk_ids), - ) - except Exception as e: - logger.error("Failed to update ANN index after bulk insert: %s", e) - - logger.debug("Exited bulk insert mode") - - class BulkInsertContext: - """Context manager for bulk insert operations.""" - - def __init__(self, store: "VectorStore") -> None: - self.store = store - - def __enter__(self) -> "VectorStore": - self.store.begin_bulk_insert() - return self.store - - def __exit__(self, exc_type, exc_val, exc_tb) -> None: - self.store.end_bulk_insert() - - def bulk_insert(self) -> "VectorStore.BulkInsertContext": - """Return a context manager for bulk insert operations. - - Usage: - with store.bulk_insert(): - for batch in batches: - store.add_chunks_batch(batch) - """ - return self.BulkInsertContext(self) - - def delete_file_chunks(self, file_path: str) -> int: - """Delete all chunks for a file. - - Returns: - Number of deleted chunks. 
- """ - # Get chunk IDs before deletion (for ANN index) - chunk_ids_to_delete = [] - if self._ann_index is not None: - with sqlite3.connect(self.db_path) as conn: - rows = conn.execute( - "SELECT id FROM semantic_chunks WHERE file_path = ?", - (file_path,) - ).fetchall() - chunk_ids_to_delete = [r[0] for r in rows] - - # Delete from SQLite - with sqlite3.connect(self.db_path) as conn: - cursor = conn.execute( - "DELETE FROM semantic_chunks WHERE file_path = ?", - (file_path,) - ) - conn.commit() - deleted = cursor.rowcount - - # Remove from ANN index - if deleted > 0 and self._ann_index is not None and chunk_ids_to_delete: - with self._ann_write_lock: - try: - self._ann_index.remove_vectors(chunk_ids_to_delete) - self._ann_index.save() - except Exception as e: - logger.warning("Failed to remove from ANN index: %s", e) - - if deleted > 0: - self._invalidate_cache() - return deleted - - def search_similar( - self, - query_embedding: List[float], - top_k: int = 10, - min_score: float = 0.0, - return_full_content: bool = True, - category: Optional[str] = None, - ) -> List[SearchResult]: - """Find chunks most similar to query embedding. - - Uses HNSW index for O(log N) search when available, falls back to - brute-force NumPy search otherwise. - - Args: - query_embedding: Query vector. - top_k: Maximum results to return. - min_score: Minimum cosine similarity score in [0.0, 1.0]. - return_full_content: If True, return full code block content. - category: Optional category filter ('code' or 'doc'). If None, returns all. - - Returns: - List of SearchResult ordered by similarity (highest first). - """ - query_vec = np.array(query_embedding, dtype=np.float32) - - if not 0.0 <= min_score <= 1.0: - raise ValueError( - f"Invalid min_score: {min_score}. Must be within [0.0, 1.0] for cosine similarity." 
- ) - - # Try HNSW search first (O(log N)) - if ( - HNSWLIB_AVAILABLE - and self._ann_index is not None - and self._ann_index.is_loaded - and self._ann_index.count() > 0 - ): - try: - return self._search_with_ann( - query_vec, top_k, min_score, return_full_content, category - ) - except Exception as e: - logger.warning("ANN search failed, falling back to brute-force: %s", e) - - # Fallback to brute-force search (O(N)) - return self._search_brute_force( - query_vec, top_k, min_score, return_full_content, category - ) - - def _search_with_ann( - self, - query_vec: np.ndarray, - top_k: int, - min_score: float, - return_full_content: bool, - category: Optional[str] = None, - ) -> List[SearchResult]: - """Search using HNSW index (O(log N)). - - Args: - query_vec: Query vector as numpy array - top_k: Maximum results to return - min_score: Minimum cosine similarity score in [0.0, 1.0] - return_full_content: If True, return full code block content - category: Optional category filter ('code' or 'doc') - - Returns: - List of SearchResult ordered by similarity (highest first) - """ - # Limit top_k to available vectors to prevent hnswlib error - ann_count = self._ann_index.count() - # When category filtering, fetch more candidates to compensate for filtering - fetch_k = top_k * 3 if category else top_k - effective_top_k = min(fetch_k, ann_count) if ann_count > 0 else 0 - - if effective_top_k == 0: - return [] - - # HNSW search returns (ids, distances) - # For cosine space: distance = 1 - similarity - ids, distances = self._ann_index.search(query_vec, effective_top_k) - - if ids is None or distances is None: - logger.debug( - "ANN search returned null results (ids=%s, distances=%s)", - ids, - distances, - ) - return [] - - if len(ids) == 0 or len(distances) == 0: - logger.debug( - "ANN search returned empty results (ids=%s, distances=%s)", - ids, - distances, - ) - return [] - - if len(ids) != len(distances): - logger.warning( - "ANN search returned mismatched result lengths 
(%d ids, %d distances)", - len(ids), - len(distances), - ) - return [] - - # Convert distances to similarity scores - scores = [1.0 - d for d in distances] - - # Filter by min_score - filtered = [ - (chunk_id, score) - for chunk_id, score in zip(ids, scores) - if score >= min_score - ] - - if not filtered: - return [] - - top_ids = [f[0] for f in filtered] - top_scores = [f[1] for f in filtered] - - # Fetch content from SQLite with category filtering - results = self._fetch_results_by_ids( - top_ids, top_scores, return_full_content, category - ) - # Apply final limit after category filtering - return results[:top_k] - - def _search_brute_force( - self, - query_vec: np.ndarray, - top_k: int, - min_score: float, - return_full_content: bool, - category: Optional[str] = None, - ) -> List[SearchResult]: - """Brute-force search using NumPy (O(N) fallback). - - Args: - query_vec: Query vector as numpy array - top_k: Maximum results to return - min_score: Minimum cosine similarity score in [0.0, 1.0] - return_full_content: If True, return full code block content - category: Optional category filter ('code' or 'doc') - - Returns: - List of SearchResult ordered by similarity (highest first) - """ - logger.warning( - "Using brute-force vector search (hnswlib not available). " - "This may cause high memory usage for large indexes. 
" - "Install hnswlib for better performance: pip install hnswlib" - ) - - with self._cache_lock: - # Refresh cache if needed - if self._embedding_matrix is None: - if not self._refresh_cache(): - return [] # No data - - # Vectorized cosine similarity - query_vec = query_vec.reshape(1, -1) - query_norm = np.linalg.norm(query_vec) - if query_norm == 0: - return [] - - # Compute all similarities at once: (N,) scores - # similarity = (A @ B.T) / (||A|| * ||B||) - dot_products = np.dot(self._embedding_matrix, query_vec.T).flatten() - scores = dot_products / (self._embedding_norms.flatten() * query_norm) - - # Filter by min_score and get top-k indices - valid_mask = scores >= min_score - valid_indices = np.where(valid_mask)[0] - - if len(valid_indices) == 0: - return [] - - # When category filtering, fetch more candidates to compensate for filtering - fetch_k = top_k * 3 if category else top_k - - # Sort by score descending and take top candidates - valid_scores = scores[valid_indices] - sorted_order = np.argsort(valid_scores)[::-1][:fetch_k] - top_indices = valid_indices[sorted_order] - top_scores = valid_scores[sorted_order] - - # Get chunk IDs for top results - top_ids = [self._chunk_ids[i] for i in top_indices] - - # Fetch content only for top-k results (lazy loading) with category filtering - results = self._fetch_results_by_ids( - top_ids, top_scores.tolist(), return_full_content, category - ) - # Apply final limit after category filtering - return results[:top_k] - - def _fetch_results_by_ids( - self, - chunk_ids: List[int], - scores: List[float], - return_full_content: bool, - category: Optional[str] = None, - ) -> List[SearchResult]: - """Fetch full result data for specific chunk IDs. - - Args: - chunk_ids: List of chunk IDs to fetch. - scores: Corresponding similarity scores. - return_full_content: Whether to include full content. - category: Optional category filter ('code' or 'doc'). - - Returns: - List of SearchResult objects. 
- """ - if not chunk_ids: - return [] - - # Build parameterized query for IN clause - placeholders = ",".join("?" * len(chunk_ids)) - _validate_sql_placeholders(placeholders, len(chunk_ids)) - - # SQL injection prevention: - # - Only a validated placeholders string (commas + '?') is interpolated into the query. - # - User-provided values are passed separately via sqlite3 parameters. - # - Category filter is added as a separate parameter - if category: - query = """ - SELECT id, file_path, content, metadata - FROM semantic_chunks - WHERE id IN ({placeholders}) AND category = ? - """.format(placeholders=placeholders) - params = list(chunk_ids) + [category] - else: - query = """ - SELECT id, file_path, content, metadata - FROM semantic_chunks - WHERE id IN ({placeholders}) - """.format(placeholders=placeholders) - params = chunk_ids - - with sqlite3.connect(self.db_path) as conn: - conn.execute("PRAGMA mmap_size = 30000000000") - rows = conn.execute(query, params).fetchall() - - # Build ID -> row mapping - id_to_row = {r[0]: r for r in rows} - - results = [] - for chunk_id, score in zip(chunk_ids, scores): - row = id_to_row.get(chunk_id) - if not row: - continue - - _, file_path, content, metadata_json = row - metadata = json.loads(metadata_json) if metadata_json else {} - - # Build excerpt (short preview) - excerpt = content[:200] + "..." 
if len(content) > 200 else content - - # Extract symbol information from metadata - symbol_name = metadata.get("symbol_name") - symbol_kind = metadata.get("symbol_kind") - start_line = metadata.get("start_line") - end_line = metadata.get("end_line") - - # Build Symbol object if we have symbol info - symbol = None - if symbol_name and symbol_kind and start_line and end_line: - try: - from codexlens.entities import Symbol - symbol = Symbol( - name=symbol_name, - kind=symbol_kind, - range=(start_line, end_line) - ) - except Exception: - pass - - results.append(SearchResult( - path=file_path, - score=score, - excerpt=excerpt, - content=content if return_full_content else None, - symbol=symbol, - metadata=metadata, - start_line=start_line, - end_line=end_line, - symbol_name=symbol_name, - symbol_kind=symbol_kind, - )) - - return results - - def count_chunks(self) -> int: - """Count total chunks in store.""" - with sqlite3.connect(self.db_path) as conn: - row = conn.execute("SELECT COUNT(*) FROM semantic_chunks").fetchone() - return row[0] if row else 0 - - def get_all_chunks(self) -> List[SemanticChunk]: - """Get all chunks from the store. - - Returns: - List of SemanticChunk objects with id and content. 
- """ - with sqlite3.connect(self.db_path) as conn: - conn.row_factory = sqlite3.Row - rows = conn.execute( - "SELECT id, file_path, content, metadata FROM semantic_chunks" - ).fetchall() - - chunks = [] - for row in rows: - chunks.append(SemanticChunk( - id=row["id"], - content=row["content"], - file_path=row["file_path"], - metadata=json.loads(row["metadata"]) if row["metadata"] else None, - )) - return chunks - - def clear_cache(self) -> None: - """Manually clear the embedding cache.""" - self._invalidate_cache() - - @property - def ann_available(self) -> bool: - """Check if ANN index is available and ready.""" - return ( - HNSWLIB_AVAILABLE - and self._ann_index is not None - and self._ann_index.is_loaded - ) - - @property - def ann_count(self) -> int: - """Get number of vectors in ANN index.""" - if self._ann_index is not None: - return self._ann_index.count() - return 0 - - def get_model_config(self) -> Optional[Dict[str, Any]]: - """Get the model configuration used for embeddings in this store. - - Returns: - Dictionary with model_profile, model_name, embedding_dim, backend, or None if not set. - """ - with sqlite3.connect(self.db_path) as conn: - row = conn.execute( - "SELECT model_profile, model_name, embedding_dim, backend, created_at, updated_at " - "FROM embeddings_config WHERE id = 1" - ).fetchone() - if row: - return { - "model_profile": row[0], - "model_name": row[1], - "embedding_dim": row[2], - "backend": row[3], - "created_at": row[4], - "updated_at": row[5], - } - return None - - def set_model_config( - self, model_profile: str, model_name: str, embedding_dim: int, backend: str = 'fastembed' - ) -> None: - """Set the model configuration for embeddings in this store. - - This should be called when generating new embeddings. If a different - model was previously used, this will update the configuration. - - Args: - model_profile: Model profile name (fast, code, minilm, etc.) 
- model_name: Full model name (e.g., jinaai/jina-embeddings-v2-base-code) - embedding_dim: Embedding dimension (e.g., 768) - backend: Backend used for embeddings (fastembed or litellm, default: fastembed) - """ - with sqlite3.connect(self.db_path) as conn: - conn.execute( - """ - INSERT INTO embeddings_config (id, model_profile, model_name, embedding_dim, backend) - VALUES (1, ?, ?, ?, ?) - ON CONFLICT(id) DO UPDATE SET - model_profile = excluded.model_profile, - model_name = excluded.model_name, - embedding_dim = excluded.embedding_dim, - backend = excluded.backend, - updated_at = CURRENT_TIMESTAMP - """, - (model_profile, model_name, embedding_dim, backend) - ) - conn.commit() - - def check_model_compatibility( - self, model_profile: str, model_name: str, embedding_dim: int - ) -> Tuple[bool, Optional[str]]: - """Check if the given model is compatible with existing embeddings. - - Args: - model_profile: Model profile to check - model_name: Model name to check - embedding_dim: Embedding dimension to check - - Returns: - Tuple of (is_compatible, warning_message). - is_compatible is True if no existing config or configs match. - warning_message is a user-friendly message if incompatible. - """ - existing = self.get_model_config() - if existing is None: - return True, None - - # Check dimension first (most critical) - if existing["embedding_dim"] != embedding_dim: - return False, ( - f"Dimension mismatch: existing embeddings use {existing['embedding_dim']}d " - f"({existing['model_profile']}), but requested model uses {embedding_dim}d " - f"({model_profile}). Use --force to regenerate all embeddings." - ) - - # Check model (different models with same dimension may have different semantic spaces) - if existing["model_profile"] != model_profile: - return False, ( - f"Model mismatch: existing embeddings use '{existing['model_profile']}' " - f"({existing['model_name']}), but requested '{model_profile}' " - f"({model_name}). Use --force to regenerate all embeddings." 
- ) - - return True, None - - def close(self) -> None: - """Close the vector store and release resources. - - This ensures SQLite connections are closed and ANN index is cleared, - allowing temporary files to be deleted on Windows. - """ - with self._cache_lock: - self._embedding_matrix = None - self._embedding_norms = None - self._chunk_ids = None - - with self._ann_write_lock: - self._ann_index = None - - def __enter__(self) -> "VectorStore": - """Context manager entry.""" - return self - - def __exit__(self, exc_type, exc_val, exc_tb) -> None: - """Context manager exit - close resources.""" - self.close() diff --git a/codex-lens/build/lib/codexlens/storage/__init__.py b/codex-lens/build/lib/codexlens/storage/__init__.py deleted file mode 100644 index 815bc961..00000000 --- a/codex-lens/build/lib/codexlens/storage/__init__.py +++ /dev/null @@ -1,32 +0,0 @@ -"""Storage backends for CodexLens.""" - -from __future__ import annotations - -from .sqlite_store import SQLiteStore -from .path_mapper import PathMapper -from .registry import RegistryStore, ProjectInfo, DirMapping -from .dir_index import DirIndexStore, SubdirLink, FileEntry -from .index_tree import IndexTreeBuilder, BuildResult, DirBuildResult -from .vector_meta_store import VectorMetadataStore - -__all__ = [ - # Legacy (workspace-local) - "SQLiteStore", - # Path mapping - "PathMapper", - # Global registry - "RegistryStore", - "ProjectInfo", - "DirMapping", - # Directory index - "DirIndexStore", - "SubdirLink", - "FileEntry", - # Tree builder - "IndexTreeBuilder", - "BuildResult", - "DirBuildResult", - # Vector metadata - "VectorMetadataStore", -] - diff --git a/codex-lens/build/lib/codexlens/storage/dir_index.py b/codex-lens/build/lib/codexlens/storage/dir_index.py deleted file mode 100644 index ee9e11c5..00000000 --- a/codex-lens/build/lib/codexlens/storage/dir_index.py +++ /dev/null @@ -1,2358 +0,0 @@ -"""Single-directory index storage with hierarchical linking. 
- -Each directory maintains its own _index.db with: -- Files in the current directory -- Links to subdirectory indexes -- Full-text search via FTS5 -- Symbol table for code navigation -""" - -from __future__ import annotations - -import logging -import hashlib -import re -import sqlite3 -import threading -import time -from dataclasses import dataclass -from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple - -from codexlens.config import Config -from codexlens.entities import CodeRelationship, SearchResult, Symbol -from codexlens.errors import StorageError -from codexlens.storage.global_index import GlobalSymbolIndex - - -@dataclass -class SubdirLink: - """Link to a subdirectory's index database.""" - - id: int - name: str - index_path: Path - files_count: int - last_updated: float - - -@dataclass -class FileEntry: - """Metadata for an indexed file in current directory.""" - - id: int - name: str - full_path: Path - language: str - mtime: float - line_count: int - - -class DirIndexStore: - """Single-directory index storage with hierarchical subdirectory linking. - - Each directory has an independent _index.db containing: - - Files table: Files in this directory only - - Subdirs table: Links to child directory indexes - - Symbols table: Code symbols from files - - FTS5 index: Full-text search on file content - - Thread-safe operations with WAL mode enabled. - """ - - # Schema version for migration tracking - # Increment this when schema changes require migration - SCHEMA_VERSION = 8 - - def __init__( - self, - db_path: str | Path, - *, - config: Config | None = None, - global_index: GlobalSymbolIndex | None = None, - ) -> None: - """Initialize directory index store. 
- - Args: - db_path: Path to _index.db file for this directory - """ - self.db_path = Path(db_path).resolve() - self._lock = threading.RLock() - self._conn: Optional[sqlite3.Connection] = None - self.logger = logging.getLogger(__name__) - self._config = config - self._global_index = global_index - - def initialize(self) -> None: - """Create database and schema if not exists.""" - with self._lock: - self.db_path.parent.mkdir(parents=True, exist_ok=True) - conn = self._get_connection() - - # Check current schema version - current_version = self._get_schema_version(conn) - - # Fail gracefully if database is from a newer version - if current_version > self.SCHEMA_VERSION: - raise StorageError( - f"Database schema version {current_version} is newer than " - f"supported version {self.SCHEMA_VERSION}. " - f"Please update the application or use a compatible database.", - db_path=str(self.db_path), - operation="initialize", - details={ - "current_version": current_version, - "supported_version": self.SCHEMA_VERSION - } - ) - - # Create or migrate schema - if current_version == 0: - # New database - create schema directly - self._create_schema(conn) - self._create_fts_triggers(conn) - self._set_schema_version(conn, self.SCHEMA_VERSION) - elif current_version < self.SCHEMA_VERSION: - # Existing database - apply migrations - self._apply_migrations(conn, current_version) - self._set_schema_version(conn, self.SCHEMA_VERSION) - - conn.commit() - - def _get_schema_version(self, conn: sqlite3.Connection) -> int: - """Get current schema version from database.""" - try: - row = conn.execute("PRAGMA user_version").fetchone() - return row[0] if row else 0 - except Exception: - return 0 - - def _set_schema_version(self, conn: sqlite3.Connection, version: int) -> None: - """Set schema version in database.""" - conn.execute(f"PRAGMA user_version = {version}") - - def _apply_migrations(self, conn: sqlite3.Connection, from_version: int) -> None: - """Apply schema migrations from current 
def close(self) -> None:
    """Close the cached database connection, if there is one.

    Closing is best-effort: a failure while closing is logged at debug
    level and otherwise ignored. The cached connection reference is always
    cleared, so calling ``close()`` again is a no-op.
    """
    with self._lock:
        # Detach the handle first so the store never keeps a reference to a
        # connection whose close() failed half-way.
        conn, self._conn = self._conn, None
        if conn is None:
            return
        try:
            conn.close()
        except Exception as exc:
            # Still best-effort, but leave a trace instead of hiding it.
            self.logger.debug(
                "Ignoring error while closing %s: %s", self.db_path, exc
            )
- - Args: - name: Filename without path - full_path: Complete source file path - content: File content for indexing - language: Programming language identifier - symbols: List of Symbol objects from the file - relationships: Optional list of CodeRelationship edges from this file - - Returns: - Database file_id - - Raises: - StorageError: If database operations fail - """ - with self._lock: - conn = self._get_connection() - full_path_str = str(Path(full_path).resolve()) - mtime = Path(full_path_str).stat().st_mtime if Path(full_path_str).exists() else None - line_count = content.count('\n') + 1 - - try: - conn.execute( - """ - INSERT INTO files(name, full_path, language, content, mtime, line_count) - VALUES(?, ?, ?, ?, ?, ?) - ON CONFLICT(full_path) DO UPDATE SET - name=excluded.name, - language=excluded.language, - content=excluded.content, - mtime=excluded.mtime, - line_count=excluded.line_count - """, - (name, full_path_str, language, content, mtime, line_count), - ) - - row = conn.execute("SELECT id FROM files WHERE full_path=?", (full_path_str,)).fetchone() - if not row: - raise StorageError(f"Failed to retrieve file_id for {full_path_str}") - - file_id = int(row["id"]) - - # Replace symbols - conn.execute("DELETE FROM symbols WHERE file_id=?", (file_id,)) - if symbols: - # Insert symbols without token_count and symbol_type - symbol_rows = [] - for s in symbols: - symbol_rows.append( - (file_id, s.name, s.kind, s.range[0], s.range[1]) - ) - - conn.executemany( - """ - INSERT INTO symbols(file_id, name, kind, start_line, end_line) - VALUES(?, ?, ?, ?, ?) 
def save_relationships(self, file_id: int, relationships: List[CodeRelationship]) -> None:
    """Persist relationship edges for a file that is already indexed.

    Nothing is written (and no connection is requested) when
    ``relationships`` is empty.

    Args:
        file_id: Database file id
        relationships: Relationship edges to persist
    """
    if relationships:
        with self._lock:
            db = self._get_connection()
            self._save_relationships(
                db, file_id=file_id, relationships=relationships
            )
            db.commit()
def _save_merkle_hash(self, conn: sqlite3.Connection, file_id: int, content: str) -> None:
    """Store the SHA-256 of ``content`` for ``file_id`` (best-effort upsert).

    The text is encoded as UTF-8 with unencodable characters dropped before
    hashing. Any SQLite error is swallowed, so a failed hash write never
    aborts the caller. The caller is responsible for committing.
    """
    payload = content.encode("utf-8", errors="ignore")
    record = (file_id, hashlib.sha256(payload).hexdigest(), time.time())
    try:
        conn.execute(
            """
            INSERT INTO merkle_hashes(file_id, sha256, updated_at)
            VALUES(?, ?, ?)
            ON CONFLICT(file_id) DO UPDATE SET
                sha256=excluded.sha256,
                updated_at=excluded.updated_at
            """,
            record,
        )
    except sqlite3.Error:
        # Deliberately ignored: the hash is auxiliary metadata.
        pass
- - Args: - files: List of (name, full_path, content, language, symbols) tuples - - Returns: - Number of files added - - Raises: - StorageError: If batch operation fails - """ - with self._lock: - conn = self._get_connection() - count = 0 - - try: - conn.execute("BEGIN") - - for name, full_path, content, language, symbols in files: - full_path_str = str(Path(full_path).resolve()) - mtime = Path(full_path_str).stat().st_mtime if Path(full_path_str).exists() else None - line_count = content.count('\n') + 1 - - conn.execute( - """ - INSERT INTO files(name, full_path, language, content, mtime, line_count) - VALUES(?, ?, ?, ?, ?, ?) - ON CONFLICT(full_path) DO UPDATE SET - name=excluded.name, - language=excluded.language, - content=excluded.content, - mtime=excluded.mtime, - line_count=excluded.line_count - """, - (name, full_path_str, language, content, mtime, line_count), - ) - - row = conn.execute("SELECT id FROM files WHERE full_path=?", (full_path_str,)).fetchone() - if not row: - raise StorageError(f"Failed to retrieve file_id for {full_path_str}") - - file_id = int(row["id"]) - count += 1 - - conn.execute("DELETE FROM symbols WHERE file_id=?", (file_id,)) - if symbols: - # Insert symbols - symbol_rows = [] - for s in symbols: - symbol_rows.append( - (file_id, s.name, s.kind, s.range[0], s.range[1]) - ) - - conn.executemany( - """ - INSERT INTO symbols(file_id, name, kind, start_line, end_line) - VALUES(?, ?, ?, ?, ?) - """, - symbol_rows, - ) - - self._save_merkle_hash(conn, file_id=file_id, content=content) - - conn.commit() - return count - - except sqlite3.DatabaseError as exc: - conn.rollback() - raise StorageError(f"Batch insert failed: {exc}") from exc - - def remove_file(self, full_path: str | Path) -> bool: - """Remove a file from the index. 
def get_file_mtime(self, full_path: str | Path) -> Optional[float]:
    """Get stored modification time for a file.

    Args:
        full_path: Complete source file path (resolved before lookup)

    Returns:
        Modification time as float, or None if the file is not indexed or
        its stored mtime is NULL
    """
    full_path_str = str(Path(full_path).resolve())
    with self._lock:
        conn = self._get_connection()
        row = conn.execute(
            "SELECT mtime FROM files WHERE full_path=?", (full_path_str,)
        ).fetchone()

    # Compare against None explicitly: a stored mtime of 0.0 (epoch-stamped
    # files) is a real value and must not be reported as "missing".
    if row is None or row["mtime"] is None:
        return None
    return float(row["mtime"])
- - When `Config.enable_merkle_detection` is enabled and Merkle metadata is - available, uses SHA-256 content hash comparison (with mtime as a fast - path to avoid hashing unchanged files). - - Args: - full_path: Complete source file path - - Returns: - True if file should be reindexed (new, modified, or missing from index) - """ - full_path_obj = Path(full_path).resolve() - if not full_path_obj.exists(): - return False # File doesn't exist, skip indexing - - # Get current filesystem mtime - try: - current_mtime = full_path_obj.stat().st_mtime - except OSError: - return False # Can't read file stats, skip - - MTIME_TOLERANCE = 0.001 - - # Fast path: mtime-only mode (default / backward-compatible) - if self._config is None or not getattr(self._config, "enable_merkle_detection", False): - stored_mtime = self.get_file_mtime(full_path_obj) - if stored_mtime is None: - return True - return abs(current_mtime - stored_mtime) > MTIME_TOLERANCE - - full_path_str = str(full_path_obj) - - # Hash-based change detection (best-effort, falls back to mtime when metadata missing) - with self._lock: - conn = self._get_connection() - try: - row = conn.execute( - """ - SELECT f.id AS file_id, f.mtime AS mtime, mh.sha256 AS sha256 - FROM files f - LEFT JOIN merkle_hashes mh ON mh.file_id = f.id - WHERE f.full_path=? - """, - (full_path_str,), - ).fetchone() - except sqlite3.Error: - row = None - - if row is None: - return True - - stored_mtime = float(row["mtime"]) if row["mtime"] else None - stored_hash = row["sha256"] if row["sha256"] else None - file_id = int(row["file_id"]) - - # Missing Merkle data: fall back to mtime - if stored_hash is None: - if stored_mtime is None: - return True - return abs(current_mtime - stored_mtime) > MTIME_TOLERANCE - - # If mtime is unchanged within tolerance, assume unchanged without hashing. 
def get_merkle_root_hash(self) -> Optional[str]:
    """Read the persisted Merkle root hash of this directory index.

    Returns:
        The ``root_hash`` of the ``merkle_state`` row with ``id=1``, or None
        when the query fails, the row is absent, or the stored hash is empty.
    """
    with self._lock:
        db = self._get_connection()
        try:
            cursor = db.execute("SELECT root_hash FROM merkle_state WHERE id=1")
            record = cursor.fetchone()
        except sqlite3.Error:
            return None

    if not record:
        return None
    stored = record["root_hash"]
    return stored if stored else None
- - The root hash includes: - - Direct file hashes from `merkle_hashes` - - Direct subdirectory root hashes (read from child `_index.db` files) - """ - if self._config is None or not getattr(self._config, "enable_merkle_detection", False): - return None - - with self._lock: - conn = self._get_connection() - try: - file_rows = conn.execute( - """ - SELECT f.name AS name, mh.sha256 AS sha256 - FROM files f - LEFT JOIN merkle_hashes mh ON mh.file_id = f.id - ORDER BY f.name - """ - ).fetchall() - - subdir_rows = conn.execute( - "SELECT name, index_path FROM subdirs ORDER BY name" - ).fetchall() - except sqlite3.Error as exc: - self.logger.debug("Failed to compute merkle root: %s", exc) - return None - - items: List[str] = [] - - for row in file_rows: - name = row["name"] - sha = (row["sha256"] or "").strip() - items.append(f"f:{name}:{sha}") - - def read_child_root(index_path: str) -> str: - try: - with sqlite3.connect(index_path) as child_conn: - child_conn.row_factory = sqlite3.Row - child_row = child_conn.execute( - "SELECT root_hash FROM merkle_state WHERE id=1" - ).fetchone() - return child_row["root_hash"] if child_row and child_row["root_hash"] else "" - except Exception: - return "" - - for row in subdir_rows: - name = row["name"] - index_path = row["index_path"] - child_hash = read_child_root(index_path) if index_path else "" - items.append(f"d:{name}:{child_hash}") - - root_hash = hashlib.sha256("\n".join(items).encode("utf-8", errors="ignore")).hexdigest() - now = time.time() - - with self._lock: - conn = self._get_connection() - try: - conn.execute( - """ - INSERT INTO merkle_state(id, root_hash, updated_at) - VALUES(1, ?, ?) 
def add_file_incremental(
    self,
    name: str,
    full_path: str | Path,
    content: str,
    language: str,
    symbols: Optional[List[Symbol]] = None,
    relationships: Optional[List[CodeRelationship]] = None,
) -> Optional[int]:
    """Index a file only when ``needs_reindex`` reports it as changed.

    Args:
        name: Filename without path
        full_path: Complete source file path
        content: File content for indexing
        language: Programming language identifier
        symbols: List of Symbol objects from the file
        relationships: Optional list of CodeRelationship edges from this file

    Returns:
        The value returned by ``add_file`` when the file was (re)indexed,
        None when it was skipped as unchanged

    Raises:
        StorageError: If database operations fail
    """
    if self.needs_reindex(full_path):
        # New or modified: delegate to the full indexing path.
        return self.add_file(name, full_path, content, language, symbols, relationships)
    return None
def list_files(self) -> List[FileEntry]:
    """Return every row of the ``files`` table as a FileEntry, sorted by name.

    NULL (or zero) ``mtime`` / ``line_count`` values are reported as 0.0 / 0.

    Returns:
        List of FileEntry objects
    """
    with self._lock:
        db = self._get_connection()
        cursor = db.execute(
            """
            SELECT id, name, full_path, language, mtime, line_count
            FROM files
            ORDER BY name
            """
        )

        entries: List[FileEntry] = []
        for rec in cursor.fetchall():
            entries.append(
                FileEntry(
                    id=int(rec["id"]),
                    name=rec["name"],
                    full_path=Path(rec["full_path"]),
                    language=rec["language"],
                    mtime=float(rec["mtime"] or 0.0),
                    line_count=int(rec["line_count"] or 0),
                )
            )
        return entries
def add_semantic_metadata(
    self,
    file_id: int,
    summary: str,
    keywords: List[str],
    purpose: str,
    llm_tool: str
) -> None:
    """Add or update semantic metadata for a file.

    The metadata row and the file's keyword links are written as one unit:
    if any statement fails, everything written by this call is rolled back
    and the SQLite error is re-raised.

    Args:
        file_id: File ID from files table
        summary: LLM-generated summary
        keywords: List of keywords (blank entries are skipped, surrounding
            whitespace is stripped, repeats are stored once)
        purpose: Purpose/role of the file
        llm_tool: Tool used to generate metadata (gemini/qwen)

    Raises:
        sqlite3.Error: If a database statement fails (after rollback)
    """
    generated_at = time.time()
    # Normalise once up front; dict.fromkeys keeps first-seen order while
    # dropping repeats, so each keyword costs one round of statements.
    cleaned = list(dict.fromkeys(kw.strip() for kw in keywords if kw.strip()))

    with self._lock:
        conn = self._get_connection()
        try:
            # Metadata row (keywords live in the normalized tables below).
            conn.execute(
                """
                INSERT INTO semantic_metadata(file_id, summary, purpose, llm_tool, generated_at)
                VALUES(?, ?, ?, ?, ?)
                ON CONFLICT(file_id) DO UPDATE SET
                    summary=excluded.summary,
                    purpose=excluded.purpose,
                    llm_tool=excluded.llm_tool,
                    generated_at=excluded.generated_at
                """,
                (file_id, summary, purpose, llm_tool, generated_at),
            )

            # Replace the file's keyword links wholesale.
            conn.execute("DELETE FROM file_keywords WHERE file_id = ?", (file_id,))

            for keyword in cleaned:
                conn.execute(
                    "INSERT OR IGNORE INTO keywords(keyword) VALUES(?)",
                    (keyword,)
                )
                row = conn.execute(
                    "SELECT id FROM keywords WHERE keyword = ?",
                    (keyword,)
                ).fetchone()
                if row:
                    conn.execute(
                        "INSERT OR IGNORE INTO file_keywords(file_id, keyword_id) VALUES(?, ?)",
                        (file_id, row["id"])
                    )

            conn.commit()
        except sqlite3.Error:
            # Without this, the half-applied write would stay in the open
            # transaction and be persisted by the next unrelated commit.
            conn.rollback()
            raise
def search_semantic_keywords(self, keyword: str, use_normalized: bool = True) -> List[Tuple[FileEntry, List[str]]]:
    """Search files by semantic keywords.

    The keyword is matched literally: ``%``, ``_`` and ``\\`` in ``keyword``
    are escaped so they are not interpreted as LIKE wildcards.

    Args:
        keyword: Keyword to search for (case-insensitive)
        use_normalized: If True (default) match keywords that start with
            ``keyword`` (prefix search); if False match keywords that
            contain it anywhere (slower but more flexible)

    Returns:
        List of (FileEntry, keywords) tuples, ordered by file name, where
        ``keywords`` holds the file's keywords that matched
    """
    # Escape the escape character first, then the two LIKE metacharacters.
    literal = (
        keyword.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
    )
    # The two modes differ only in where the wildcard goes.
    keyword_pattern = f"{literal}%" if use_normalized else f"%{literal}%"

    with self._lock:
        conn = self._get_connection()
        rows = conn.execute(
            """
            SELECT f.id, f.name, f.full_path, f.language, f.mtime, f.line_count,
                   GROUP_CONCAT(k.keyword, ',') as keywords
            FROM files f
            JOIN file_keywords fk ON f.id = fk.file_id
            JOIN keywords k ON fk.keyword_id = k.id
            WHERE k.keyword LIKE ? COLLATE NOCASE ESCAPE '\\'
            GROUP BY f.id, f.name, f.full_path, f.language, f.mtime, f.line_count
            ORDER BY f.name
            """,
            (keyword_pattern,),
        ).fetchall()

        results = []
        for row in rows:
            file_entry = FileEntry(
                id=int(row["id"]),
                name=row["name"],
                full_path=Path(row["full_path"]),
                language=row["language"],
                mtime=float(row["mtime"]) if row["mtime"] else 0.0,
                line_count=int(row["line_count"]) if row["line_count"] else 0,
            )
            matched = row["keywords"].split(',') if row["keywords"] else []
            results.append((file_entry, matched))

        return results
def register_subdir(
    self,
    name: str,
    index_path: str | Path,
    files_count: int = 0,
    direct_files: int = 0,
) -> None:
    """Insert a subdirectory link, or refresh it when ``name`` already exists.

    The stored path is the resolved form of ``index_path`` and
    ``last_updated`` is set to the current time. The change is committed
    before returning.

    Args:
        name: Subdirectory name
        index_path: Path to subdirectory's _index.db
        files_count: Total files recursively
        direct_files: Deprecated parameter (no longer used)
    """
    upsert_sql = """
        INSERT INTO subdirs(name, index_path, files_count, last_updated)
        VALUES(?, ?, ?, ?)
        ON CONFLICT(name) DO UPDATE SET
            index_path=excluded.index_path,
            files_count=excluded.files_count,
            last_updated=excluded.last_updated
        """
    with self._lock:
        db = self._get_connection()
        resolved = str(Path(index_path).resolve())
        stamp = time.time()
        # direct_files is accepted for backward compatibility only.
        db.execute(upsert_sql, (name, resolved, files_count, stamp))
        db.commit()
@staticmethod
def _enhance_fts_query(query: str) -> str:
    """Enhance FTS5 query to support prefix matching for simple queries.

    For simple single-word or multi-word queries without FTS5 operators,
    automatically adds prefix wildcard (*) to enable partial matching.

    Examples:
        "loadPack" -> "loadPack*"
        "load package" -> "load* package*"
        "load*" -> "load*" (already has wildcard, unchanged)
        "NOT test" -> "NOT test" (has FTS operator, unchanged)

    Args:
        query: Original FTS5 query string

    Returns:
        Enhanced query string with prefix wildcards for simple queries;
        the query unchanged when it contains a wildcard, a double quote or
        an AND/OR/NOT/NEAR operator (matched case-insensitively)
    """
    # Explicit wildcards or quoted phrases mean the caller wrote FTS syntax.
    if '*' in query or '"' in query:
        return query

    words = query.split()

    # Detect operators per token rather than as " OP " substrings, so an
    # operator at the start or end of the query, or one separated by tabs or
    # repeated spaces, is recognised too. "NEAR(" covers NEAR(a b, n).
    operators = {'AND', 'OR', 'NOT', 'NEAR'}
    for word in words:
        upper = word.upper()
        if upper in operators or upper.startswith('NEAR('):
            return query

    # Simple query: turn every word into a prefix match.
    return ' '.join(f"{word}*" for word in words)
def _extract_code_block(
    self,
    content: str,
    start_line: int,
    end_line: int,
    match_line: Optional[int] = None,
    context_lines: int = 5,
) -> Tuple[str, int, int]:
    """Slice the lines ``start_line..end_line`` (1-based, inclusive) out of content.

    ``start_line`` is raised to 1 when smaller and ``end_line`` is lowered
    to the number of lines when larger; the adjusted bounds are returned
    alongside the text.

    Args:
        content: Full file content
        start_line: 1-based start line (from symbol or calculated)
        end_line: 1-based end line (from symbol or calculated)
        match_line: Accepted but not used by this implementation
        context_lines: Accepted but not used by this implementation

    Returns:
        Tuple of (code_block, actual_start_line, actual_end_line)
    """
    rows = content.split('\n')
    line_total = len(rows)

    first = start_line if start_line > 1 else 1
    last = end_line if end_line < line_total else line_total

    # 1-based inclusive bounds -> 0-based half-open slice.
    return '\n'.join(rows[first - 1:last]), first, last
- - Args: - conn: Database connection - file_ids: List of file IDs to fetch symbols for - - Returns: - Dictionary mapping file_id to list of (start_line, end_line, name, kind) tuples - """ - if not file_ids: - return {} - - # Build placeholder string for IN clause - placeholders = ','.join('?' for _ in file_ids) - rows = conn.execute( - f""" - SELECT file_id, start_line, end_line, name, kind - FROM symbols - WHERE file_id IN ({placeholders}) - ORDER BY file_id, (end_line - start_line) ASC - """, - file_ids, - ).fetchall() - - # Organize symbols by file_id - symbols_by_file: Dict[int, List[Tuple[int, int, str, str]]] = {fid: [] for fid in file_ids} - for row in rows: - symbols_by_file[row["file_id"]].append( - (row["start_line"], row["end_line"], row["name"], row["kind"]) - ) - return symbols_by_file - - def _find_containing_symbol_from_cache( - self, symbols: List[Tuple[int, int, str, str]], line_num: int - ) -> Optional[Tuple[int, int, str, str]]: - """Find the smallest symbol containing the given line number from cached symbols. - - Args: - symbols: List of (start_line, end_line, name, kind) tuples, sorted by size - line_num: 1-based line number - - Returns: - Tuple of (start_line, end_line, symbol_name, symbol_kind) or None - """ - for start_line, end_line, name, kind in symbols: - if start_line <= line_num <= end_line: - return (start_line, end_line, name, kind) - return None - - def _generate_centered_excerpt( - self, content: str, match_line: int, start_line: int, end_line: int, max_chars: int = 200 - ) -> str: - """Generate excerpt centered around the match line. 
- - Args: - content: Full file content - match_line: 1-based line where match occurred - start_line: 1-based start line of the code block - end_line: 1-based end line of the code block - max_chars: Maximum characters for excerpt - - Returns: - Excerpt string centered around the match - """ - lines = content.split('\n') - total_lines = len(lines) - - # Ensure match_line is within bounds - match_line = max(1, min(match_line, total_lines)) - - # Calculate context window (2 lines before, 2 lines after the match) - ctx_start = max(start_line, match_line - 2) - ctx_end = min(end_line, match_line + 2) - - # Extract and join lines - excerpt_lines = lines[ctx_start - 1:ctx_end] - excerpt = '\n'.join(excerpt_lines) - - # Truncate if too long - if len(excerpt) > max_chars: - excerpt = excerpt[:max_chars] + "..." - - return excerpt - - def _search_internal( - self, - query: str, - fts_table: str, - limit: int = 20, - return_full_content: bool = False, - context_lines: int = 10, - ) -> List[SearchResult]: - """Internal unified search implementation for all FTS modes. - - Optimizations: - - Fast path: Direct FTS query with snippet() for location-only results - - Full content path: Batch fetch symbols to eliminate N+1 queries - - Centered excerpt generation for better context - - Args: - query: FTS5 query string - fts_table: FTS table name ('files_fts_exact' or 'files_fts_fuzzy') - limit: Maximum results to return - return_full_content: If True, include full code block in content field - context_lines: Lines of context when no symbol contains the match - - Returns: - List of SearchResult objects - """ - with self._lock: - conn = self._get_connection() - - # Fast path: location-only results (no content processing) - if not return_full_content: - try: - rows = conn.execute( - f""" - SELECT rowid, full_path, bm25({fts_table}) AS rank, - snippet({fts_table}, 2, '', '', '...', 30) AS excerpt - FROM {fts_table} - WHERE {fts_table} MATCH ? - ORDER BY rank - LIMIT ? 
- """, - (query, limit), - ).fetchall() - except sqlite3.DatabaseError as exc: - raise StorageError(f"FTS search failed: {exc}") from exc - - results: List[SearchResult] = [] - for row in rows: - rank = float(row["rank"]) if row["rank"] is not None else 0.0 - score = abs(rank) if rank < 0 else 0.0 - results.append( - SearchResult( - path=row["full_path"], - score=score, - excerpt=row["excerpt"], - ) - ) - return results - - # Full content path with batch optimization - # Step 1: Get file_ids and ranks (lightweight query) - try: - id_rows = conn.execute( - f""" - SELECT rowid AS file_id, bm25({fts_table}) AS rank - FROM {fts_table} - WHERE {fts_table} MATCH ? - ORDER BY rank - LIMIT ? - """, - (query, limit), - ).fetchall() - except sqlite3.DatabaseError as exc: - raise StorageError(f"FTS search failed: {exc}") from exc - - if not id_rows: - return [] - - file_ids = [row["file_id"] for row in id_rows] - ranks_by_id = {row["file_id"]: row["rank"] for row in id_rows} - - # Step 2: Batch fetch all symbols for matched files (eliminates N+1) - symbols_by_file = self._batch_fetch_symbols(conn, file_ids) - - # Step 3: Process each file on-demand (reduces memory) - results: List[SearchResult] = [] - for file_id in file_ids: - # Fetch file content on-demand - file_row = conn.execute( - "SELECT full_path, content FROM files WHERE id = ?", - (file_id,), - ).fetchone() - - if not file_row: - continue - - file_path = file_row["full_path"] - content = file_row["content"] or "" - rank = ranks_by_id.get(file_id, 0.0) - score = abs(rank) if rank < 0 else 0.0 - - # Find matching lines - match_lines = self._find_match_lines(content, query) - first_match_line = match_lines[0] if match_lines else 1 - - # Find symbol from cached symbols (no extra SQL query) - file_symbols = symbols_by_file.get(file_id, []) - symbol_info = self._find_containing_symbol_from_cache(file_symbols, first_match_line) - - if symbol_info: - start_line, end_line, symbol_name, symbol_kind = symbol_info - else: - # 
No symbol found, use context around match - lines = content.split('\n') - total_lines = len(lines) - start_line = max(1, first_match_line - context_lines) - end_line = min(total_lines, first_match_line + context_lines) - symbol_name = None - symbol_kind = None - - # Extract code block - block_content, start_line, end_line = self._extract_code_block( - content, start_line, end_line - ) - - # Generate centered excerpt (improved quality) - excerpt = self._generate_centered_excerpt( - content, first_match_line, start_line, end_line - ) - - results.append( - SearchResult( - path=file_path, - score=score, - excerpt=excerpt, - content=block_content, - start_line=start_line, - end_line=end_line, - symbol_name=symbol_name, - symbol_kind=symbol_kind, - ) - ) - return results - - - def search_fts( - self, - query: str, - limit: int = 20, - enhance_query: bool = False, - return_full_content: bool = False, - context_lines: int = 10, - ) -> List[SearchResult]: - """Full-text search in current directory files. - - Uses files_fts_exact (unicode61 tokenizer) for exact token matching. - For fuzzy/substring search, use search_fts_fuzzy() instead. - - Best Practice (from industry analysis of Codanna/Code-Index-MCP): - - Default: Respects exact user input without modification - - Users can manually add wildcards (e.g., "loadPack*") for prefix matching - - Automatic enhancement (enhance_query=True) is NOT recommended as it can - violate user intent and bring unwanted noise in results - - Args: - query: FTS5 query string - limit: Maximum results to return - enhance_query: If True, automatically add prefix wildcards for simple queries. - Default False to respect exact user input. - return_full_content: If True, include full code block in content field. - Default False for fast location-only results. 
- context_lines: Lines of context when no symbol contains the match - - Returns: - List of SearchResult objects (location-only by default, with content if requested) - - Raises: - StorageError: If FTS search fails - """ - final_query = self._enhance_fts_query(query) if enhance_query else query - return self._search_internal( - query=final_query, - fts_table='files_fts_exact', - limit=limit, - return_full_content=return_full_content, - context_lines=context_lines, - ) - - def search_fts_exact( - self, - query: str, - limit: int = 20, - return_full_content: bool = False, - context_lines: int = 10, - ) -> List[SearchResult]: - """Full-text search using exact token matching. - - Args: - query: FTS5 query string - limit: Maximum results to return - return_full_content: If True, include full code block in content field. - Default False for fast location-only results. - context_lines: Lines of context when no symbol contains the match - - Returns: - List of SearchResult objects (location-only by default, with content if requested) - - Raises: - StorageError: If FTS search fails - """ - return self._search_internal( - query=query, - fts_table='files_fts_exact', - limit=limit, - return_full_content=return_full_content, - context_lines=context_lines, - ) - - def search_fts_fuzzy( - self, - query: str, - limit: int = 20, - return_full_content: bool = False, - context_lines: int = 10, - ) -> List[SearchResult]: - """Full-text search using fuzzy/substring matching. - - Args: - query: FTS5 query string - limit: Maximum results to return - return_full_content: If True, include full code block in content field. - Default False for fast location-only results. 
- context_lines: Lines of context when no symbol contains the match - - Returns: - List of SearchResult objects (location-only by default, with content if requested) - - Raises: - StorageError: If FTS search fails - """ - return self._search_internal( - query=query, - fts_table='files_fts_fuzzy', - limit=limit, - return_full_content=return_full_content, - context_lines=context_lines, - ) - - def search_files_only(self, query: str, limit: int = 20) -> List[str]: - """Fast FTS search returning only file paths (no snippet generation). - - Optimized for when only file paths are needed, skipping expensive - snippet() function call. - - Args: - query: FTS5 query string - limit: Maximum results to return - - Returns: - List of file paths as strings - - Raises: - StorageError: If FTS search fails - """ - with self._lock: - conn = self._get_connection() - try: - rows = conn.execute( - """ - SELECT full_path - FROM files_fts - WHERE files_fts MATCH ? - ORDER BY bm25(files_fts) - LIMIT ? - """, - (query, limit), - ).fetchall() - except sqlite3.DatabaseError as exc: - raise StorageError(f"FTS search failed: {exc}") from exc - - return [row["full_path"] for row in rows] - - def search_symbols( - self, name: str, kind: Optional[str] = None, limit: int = 50, prefix_mode: bool = True - ) -> List[Symbol]: - """Search symbols by name pattern. - - Args: - name: Symbol name pattern - kind: Optional symbol kind filter - limit: Maximum results to return - prefix_mode: If True, use prefix search (faster with index); - If False, use substring search (slower) - - Returns: - List of Symbol objects - """ - # Prefix search is much faster as it can use index - if prefix_mode: - pattern = f"{name}%" - else: - pattern = f"%{name}%" - - with self._lock: - conn = self._get_connection() - if kind: - rows = conn.execute( - """ - SELECT s.name, s.kind, s.start_line, s.end_line, f.full_path - FROM symbols s - JOIN files f ON s.file_id = f.id - WHERE s.name LIKE ? AND s.kind=? 
- ORDER BY s.name - LIMIT ? - """, - (pattern, kind, limit), - ).fetchall() - else: - rows = conn.execute( - """ - SELECT s.name, s.kind, s.start_line, s.end_line, f.full_path - FROM symbols s - JOIN files f ON s.file_id = f.id - WHERE s.name LIKE ? - ORDER BY s.name - LIMIT ? - """, - (pattern, limit), - ).fetchall() - - return [ - Symbol( - name=row["name"], - kind=row["kind"], - range=(row["start_line"], row["end_line"]), - file=row["full_path"], - ) - for row in rows - ] - - def get_file_symbols(self, file_path: str | Path) -> List[Symbol]: - """Get all symbols in a specific file, sorted by start_line. - - Args: - file_path: Full path to the file - - Returns: - List of Symbol objects sorted by start_line - """ - file_path_str = str(Path(file_path).resolve()) - - with self._lock: - conn = self._get_connection() - # First get the file_id - file_row = conn.execute( - "SELECT id FROM files WHERE full_path=?", - (file_path_str,), - ).fetchone() - - if not file_row: - return [] - - file_id = int(file_row["id"]) - - rows = conn.execute( - """ - SELECT s.name, s.kind, s.start_line, s.end_line - FROM symbols s - WHERE s.file_id=? - ORDER BY s.start_line - """, - (file_id,), - ).fetchall() - - return [ - Symbol( - name=row["name"], - kind=row["kind"], - range=(row["start_line"], row["end_line"]), - file=file_path_str, - ) - for row in rows - ] - - def get_outgoing_calls( - self, - file_path: str | Path, - symbol_name: Optional[str] = None, - ) -> List[Tuple[str, str, int, Optional[str]]]: - """Get outgoing calls from symbols in a file. - - Queries code_relationships table for calls originating from symbols - in the specified file. - - Args: - file_path: Full path to the source file - symbol_name: Optional symbol name to filter by. If None, returns - calls from all symbols in the file. 
- - Returns: - List of tuples: (target_name, relationship_type, source_line, target_file) - - target_name: Qualified name of the call target - - relationship_type: Type of relationship (e.g., "calls", "imports") - - source_line: Line number where the call occurs - - target_file: Target file path (may be None if unknown) - """ - file_path_str = str(Path(file_path).resolve()) - - with self._lock: - conn = self._get_connection() - # First get the file_id - file_row = conn.execute( - "SELECT id FROM files WHERE full_path=?", - (file_path_str,), - ).fetchone() - - if not file_row: - return [] - - file_id = int(file_row["id"]) - - if symbol_name: - rows = conn.execute( - """ - SELECT cr.target_qualified_name, cr.relationship_type, - cr.source_line, cr.target_file - FROM code_relationships cr - JOIN symbols s ON s.id = cr.source_symbol_id - WHERE s.file_id=? AND s.name=? - ORDER BY cr.source_line - """, - (file_id, symbol_name), - ).fetchall() - else: - rows = conn.execute( - """ - SELECT cr.target_qualified_name, cr.relationship_type, - cr.source_line, cr.target_file - FROM code_relationships cr - JOIN symbols s ON s.id = cr.source_symbol_id - WHERE s.file_id=? - ORDER BY cr.source_line - """, - (file_id,), - ).fetchall() - - return [ - ( - row["target_qualified_name"], - row["relationship_type"], - int(row["source_line"]), - row["target_file"], - ) - for row in rows - ] - - def get_incoming_calls( - self, - target_name: str, - limit: int = 100, - ) -> List[Tuple[str, str, int, str]]: - """Get incoming calls/references to a target symbol. - - Queries code_relationships table for references to the specified - target symbol name. - - Args: - target_name: Name of the target symbol to find references for. - Matches against target_qualified_name (exact match, - suffix match, or contains match). 
- limit: Maximum number of results to return - - Returns: - List of tuples: (source_symbol_name, relationship_type, source_line, source_file) - - source_symbol_name: Name of the calling symbol - - relationship_type: Type of relationship (e.g., "calls", "imports") - - source_line: Line number where the call occurs - - source_file: Full path to the source file - """ - with self._lock: - conn = self._get_connection() - rows = conn.execute( - """ - SELECT s.name AS source_name, cr.relationship_type, - cr.source_line, f.full_path AS source_file - FROM code_relationships cr - JOIN symbols s ON s.id = cr.source_symbol_id - JOIN files f ON f.id = s.file_id - WHERE cr.target_qualified_name = ? - OR cr.target_qualified_name LIKE ? - OR cr.target_qualified_name LIKE ? - ORDER BY f.full_path, cr.source_line - LIMIT ? - """, - ( - target_name, - f"%.{target_name}", - f"%{target_name}", - limit, - ), - ).fetchall() - - return [ - ( - row["source_name"], - row["relationship_type"], - int(row["source_line"]), - row["source_file"], - ) - for row in rows - ] - - # === Statistics === - - def stats(self) -> Dict[str, Any]: - """Get current directory statistics. 
- - Returns: - Dictionary containing: - - files: Number of files in this directory - - symbols: Number of symbols - - subdirs: Number of subdirectories - - total_files: Total files including subdirectories - - languages: Dictionary of language counts - """ - with self._lock: - conn = self._get_connection() - - file_count = conn.execute("SELECT COUNT(*) AS c FROM files").fetchone()["c"] - symbol_count = conn.execute("SELECT COUNT(*) AS c FROM symbols").fetchone()["c"] - subdir_count = conn.execute("SELECT COUNT(*) AS c FROM subdirs").fetchone()["c"] - - total_files_row = conn.execute( - "SELECT COALESCE(SUM(files_count), 0) AS total FROM subdirs" - ).fetchone() - total_files = int(file_count) + int(total_files_row["total"] if total_files_row else 0) - - lang_rows = conn.execute( - "SELECT language, COUNT(*) AS c FROM files GROUP BY language ORDER BY c DESC" - ).fetchall() - languages = {row["language"]: int(row["c"]) for row in lang_rows} - - return { - "files": int(file_count), - "symbols": int(symbol_count), - "subdirs": int(subdir_count), - "total_files": total_files, - "languages": languages, - } - - # === Internal Methods === - - def _get_connection(self) -> sqlite3.Connection: - """Get or create database connection with proper configuration. 
- - Returns: - sqlite3.Connection with WAL mode and foreign keys enabled - """ - if self._conn is None: - self._conn = sqlite3.connect(str(self.db_path), check_same_thread=False) - self._conn.row_factory = sqlite3.Row - self._conn.execute("PRAGMA journal_mode=WAL") - self._conn.execute("PRAGMA synchronous=NORMAL") - self._conn.execute("PRAGMA foreign_keys=ON") - # Memory-mapped I/O for faster reads (30GB limit) - self._conn.execute("PRAGMA mmap_size=30000000000") - return self._conn - - def _maybe_update_global_symbols(self, file_path: str, symbols: List[Symbol]) -> None: - if self._global_index is None: - return - if self._config is not None and not getattr(self._config, "global_symbol_index_enabled", True): - return - try: - self._global_index.update_file_symbols( - file_path=file_path, - symbols=symbols, - index_path=str(self.db_path), - ) - except Exception as exc: - # Global index is an optimization; local directory index remains authoritative. - self.logger.debug("Global symbol index update failed for %s: %s", file_path, exc) - - def _maybe_delete_global_symbols(self, file_path: str) -> None: - if self._global_index is None: - return - if self._config is not None and not getattr(self._config, "global_symbol_index_enabled", True): - return - try: - self._global_index.delete_file_symbols(file_path) - except Exception as exc: - self.logger.debug("Global symbol index delete failed for %s: %s", file_path, exc) - - def _create_schema(self, conn: sqlite3.Connection) -> None: - """Create database schema. 
- - Args: - conn: Database connection - - Raises: - StorageError: If schema creation fails - """ - try: - # Files table - conn.execute( - """ - CREATE TABLE IF NOT EXISTS files ( - id INTEGER PRIMARY KEY, - name TEXT NOT NULL, - full_path TEXT UNIQUE NOT NULL, - language TEXT, - content TEXT, - mtime REAL, - line_count INTEGER - ) - """ - ) - - # Subdirectories table (v5: removed direct_files) - conn.execute( - """ - CREATE TABLE IF NOT EXISTS subdirs ( - id INTEGER PRIMARY KEY, - name TEXT NOT NULL UNIQUE, - index_path TEXT NOT NULL, - files_count INTEGER DEFAULT 0, - last_updated REAL - ) - """ - ) - - # Symbols table with token metadata - conn.execute( - """ - CREATE TABLE IF NOT EXISTS symbols ( - id INTEGER PRIMARY KEY, - file_id INTEGER REFERENCES files(id) ON DELETE CASCADE, - name TEXT NOT NULL, - kind TEXT NOT NULL, - start_line INTEGER, - end_line INTEGER - ) - """ - ) - - # Dual FTS5 external content tables for exact and fuzzy matching - # files_fts_exact: unicode61 tokenizer for exact token matching - # files_fts_fuzzy: trigram tokenizer (or extended unicode61) for substring/fuzzy matching - from codexlens.storage.sqlite_utils import check_trigram_support - - has_trigram = check_trigram_support(conn) - fuzzy_tokenizer = "trigram" if has_trigram else "unicode61 tokenchars '_-.'" - - # Exact FTS table with unicode61 tokenizer - # Note: tokenchars includes '.' 
to properly tokenize qualified names like PortRole.FLOW - conn.execute( - """ - CREATE VIRTUAL TABLE IF NOT EXISTS files_fts_exact USING fts5( - name, full_path UNINDEXED, content, - content='files', - content_rowid='id', - tokenize="unicode61 tokenchars '_-.'" - ) - """ - ) - - # Fuzzy FTS table with trigram or extended unicode61 tokenizer - conn.execute( - f""" - CREATE VIRTUAL TABLE IF NOT EXISTS files_fts_fuzzy USING fts5( - name, full_path UNINDEXED, content, - content='files', - content_rowid='id', - tokenize="{fuzzy_tokenizer}" - ) - """ - ) - - # Semantic metadata table (v5: removed keywords column) - conn.execute( - """ - CREATE TABLE IF NOT EXISTS semantic_metadata ( - id INTEGER PRIMARY KEY, - file_id INTEGER UNIQUE REFERENCES files(id) ON DELETE CASCADE, - summary TEXT, - purpose TEXT, - llm_tool TEXT, - generated_at REAL - ) - """ - ) - - # Normalized keywords tables for performance - conn.execute( - """ - CREATE TABLE IF NOT EXISTS keywords ( - id INTEGER PRIMARY KEY, - keyword TEXT NOT NULL UNIQUE - ) - """ - ) - - conn.execute( - """ - CREATE TABLE IF NOT EXISTS file_keywords ( - file_id INTEGER NOT NULL, - keyword_id INTEGER NOT NULL, - PRIMARY KEY (file_id, keyword_id), - FOREIGN KEY (file_id) REFERENCES files (id) ON DELETE CASCADE, - FOREIGN KEY (keyword_id) REFERENCES keywords (id) ON DELETE CASCADE - ) - """ - ) - - # Code relationships table for graph visualization - conn.execute( - """ - CREATE TABLE IF NOT EXISTS code_relationships ( - id INTEGER PRIMARY KEY, - source_symbol_id INTEGER NOT NULL, - target_qualified_name TEXT NOT NULL, - relationship_type TEXT NOT NULL, - source_line INTEGER NOT NULL, - target_file TEXT, - FOREIGN KEY (source_symbol_id) REFERENCES symbols (id) ON DELETE CASCADE - ) - """ - ) - - # Precomputed graph neighbors cache for search expansion (v7) - conn.execute( - """ - CREATE TABLE IF NOT EXISTS graph_neighbors ( - source_symbol_id INTEGER NOT NULL REFERENCES symbols(id) ON DELETE CASCADE, - neighbor_symbol_id 
INTEGER NOT NULL REFERENCES symbols(id) ON DELETE CASCADE, - relationship_depth INTEGER NOT NULL, - PRIMARY KEY (source_symbol_id, neighbor_symbol_id) - ) - """ - ) - - # Merkle hashes for incremental change detection (v8) - conn.execute( - """ - CREATE TABLE IF NOT EXISTS merkle_hashes ( - file_id INTEGER PRIMARY KEY REFERENCES files(id) ON DELETE CASCADE, - sha256 TEXT NOT NULL, - updated_at REAL - ) - """ - ) - - conn.execute( - """ - CREATE TABLE IF NOT EXISTS merkle_state ( - id INTEGER PRIMARY KEY CHECK (id = 1), - root_hash TEXT, - updated_at REAL - ) - """ - ) - - # Indexes (v5: removed idx_symbols_type) - conn.execute("CREATE INDEX IF NOT EXISTS idx_files_name ON files(name)") - conn.execute("CREATE INDEX IF NOT EXISTS idx_files_path ON files(full_path)") - conn.execute("CREATE INDEX IF NOT EXISTS idx_subdirs_name ON subdirs(name)") - conn.execute("CREATE INDEX IF NOT EXISTS idx_symbols_name ON symbols(name)") - conn.execute("CREATE INDEX IF NOT EXISTS idx_symbols_file ON symbols(file_id)") - conn.execute("CREATE INDEX IF NOT EXISTS idx_semantic_file ON semantic_metadata(file_id)") - conn.execute("CREATE INDEX IF NOT EXISTS idx_keywords_keyword ON keywords(keyword)") - conn.execute("CREATE INDEX IF NOT EXISTS idx_file_keywords_file_id ON file_keywords(file_id)") - conn.execute("CREATE INDEX IF NOT EXISTS idx_file_keywords_keyword_id ON file_keywords(keyword_id)") - conn.execute("CREATE INDEX IF NOT EXISTS idx_rel_source ON code_relationships(source_symbol_id)") - conn.execute("CREATE INDEX IF NOT EXISTS idx_rel_target ON code_relationships(target_qualified_name)") - conn.execute("CREATE INDEX IF NOT EXISTS idx_rel_type ON code_relationships(relationship_type)") - conn.execute( - "CREATE INDEX IF NOT EXISTS idx_graph_neighbors_source_depth " - "ON graph_neighbors(source_symbol_id, relationship_depth)" - ) - conn.execute( - "CREATE INDEX IF NOT EXISTS idx_graph_neighbors_neighbor " - "ON graph_neighbors(neighbor_symbol_id)" - ) - - except 
sqlite3.DatabaseError as exc: - raise StorageError(f"Failed to create schema: {exc}") from exc - - def _migrate_v2_add_name_column(self, conn: sqlite3.Connection) -> None: - """Migration v2: Add 'name' column to files table. - - Required for FTS5 external content table. - - Args: - conn: Database connection - """ - # Check if files table exists and has columns - cursor = conn.execute("PRAGMA table_info(files)") - files_columns = {row[1] for row in cursor.fetchall()} - - if not files_columns: - return # No files table yet, will be created fresh - - # Skip if 'name' column already exists - if "name" in files_columns: - return - - # Add 'name' column with default value - conn.execute("ALTER TABLE files ADD COLUMN name TEXT NOT NULL DEFAULT ''") - - # Populate 'name' column from full_path using pathlib for robustness - rows = conn.execute("SELECT id, full_path FROM files WHERE name = ''").fetchall() - for row in rows: - file_id = row[0] - full_path = row[1] - # Use pathlib.Path.name for cross-platform compatibility - name = Path(full_path).name if full_path else "" - conn.execute("UPDATE files SET name = ? WHERE id = ?", (name, file_id)) - - def _create_fts_triggers(self, conn: sqlite3.Connection) -> None: - """Create FTS5 external content triggers for dual FTS tables. - - Creates synchronized triggers for both files_fts_exact and files_fts_fuzzy tables. 
- - Args: - conn: Database connection - """ - # Insert triggers for files_fts_exact - conn.execute( - """ - CREATE TRIGGER IF NOT EXISTS files_exact_ai AFTER INSERT ON files BEGIN - INSERT INTO files_fts_exact(rowid, name, full_path, content) - VALUES(new.id, new.name, new.full_path, new.content); - END - """ - ) - - # Delete trigger for files_fts_exact - conn.execute( - """ - CREATE TRIGGER IF NOT EXISTS files_exact_ad AFTER DELETE ON files BEGIN - INSERT INTO files_fts_exact(files_fts_exact, rowid, name, full_path, content) - VALUES('delete', old.id, old.name, old.full_path, old.content); - END - """ - ) - - # Update trigger for files_fts_exact - conn.execute( - """ - CREATE TRIGGER IF NOT EXISTS files_exact_au AFTER UPDATE ON files BEGIN - INSERT INTO files_fts_exact(files_fts_exact, rowid, name, full_path, content) - VALUES('delete', old.id, old.name, old.full_path, old.content); - INSERT INTO files_fts_exact(rowid, name, full_path, content) - VALUES(new.id, new.name, new.full_path, new.content); - END - """ - ) - - # Insert trigger for files_fts_fuzzy - conn.execute( - """ - CREATE TRIGGER IF NOT EXISTS files_fuzzy_ai AFTER INSERT ON files BEGIN - INSERT INTO files_fts_fuzzy(rowid, name, full_path, content) - VALUES(new.id, new.name, new.full_path, new.content); - END - """ - ) - - # Delete trigger for files_fts_fuzzy - conn.execute( - """ - CREATE TRIGGER IF NOT EXISTS files_fuzzy_ad AFTER DELETE ON files BEGIN - INSERT INTO files_fts_fuzzy(files_fts_fuzzy, rowid, name, full_path, content) - VALUES('delete', old.id, old.name, old.full_path, old.content); - END - """ - ) - - # Update trigger for files_fts_fuzzy - conn.execute( - """ - CREATE TRIGGER IF NOT EXISTS files_fuzzy_au AFTER UPDATE ON files BEGIN - INSERT INTO files_fts_fuzzy(files_fts_fuzzy, rowid, name, full_path, content) - VALUES('delete', old.id, old.name, old.full_path, old.content); - INSERT INTO files_fts_fuzzy(rowid, name, full_path, content) - VALUES(new.id, new.name, new.full_path, 
new.content); - END - """ - ) diff --git a/codex-lens/build/lib/codexlens/storage/file_cache.py b/codex-lens/build/lib/codexlens/storage/file_cache.py deleted file mode 100644 index b43613d1..00000000 --- a/codex-lens/build/lib/codexlens/storage/file_cache.py +++ /dev/null @@ -1,32 +0,0 @@ -"""Simple filesystem cache helpers.""" - -from __future__ import annotations - -from dataclasses import dataclass -from pathlib import Path -from typing import Optional - - -@dataclass -class FileCache: - """Caches file mtimes for incremental indexing.""" - - cache_path: Path - - def load_mtime(self, path: Path) -> Optional[float]: - try: - key = self._key_for(path) - record = (self.cache_path / key).read_text(encoding="utf-8") - return float(record) - except Exception: - return None - - def store_mtime(self, path: Path, mtime: float) -> None: - self.cache_path.mkdir(parents=True, exist_ok=True) - key = self._key_for(path) - (self.cache_path / key).write_text(str(mtime), encoding="utf-8") - - def _key_for(self, path: Path) -> str: - safe = str(path).replace(":", "_").replace("\\", "_").replace("/", "_") - return f"{safe}.mtime" - diff --git a/codex-lens/build/lib/codexlens/storage/global_index.py b/codex-lens/build/lib/codexlens/storage/global_index.py deleted file mode 100644 index badcc2be..00000000 --- a/codex-lens/build/lib/codexlens/storage/global_index.py +++ /dev/null @@ -1,398 +0,0 @@ -"""Global cross-directory symbol index for fast lookups. - -Stores symbols for an entire project in a single SQLite database so symbol search -does not require traversing every directory _index.db. - -This index is updated incrementally during file indexing (delete+insert per file) -to avoid expensive batch rebuilds. 
-""" - -from __future__ import annotations - -import logging -import sqlite3 -import threading -from pathlib import Path -from typing import List, Optional, Tuple - -from codexlens.entities import Symbol -from codexlens.errors import StorageError - - -class GlobalSymbolIndex: - """Project-wide symbol index with incremental updates.""" - - SCHEMA_VERSION = 1 - DEFAULT_DB_NAME = "_global_symbols.db" - - def __init__(self, db_path: str | Path, project_id: int) -> None: - self.db_path = Path(db_path).resolve() - self.project_id = int(project_id) - self._lock = threading.RLock() - self._conn: Optional[sqlite3.Connection] = None - self.logger = logging.getLogger(__name__) - - def initialize(self) -> None: - """Create database and schema if not exists.""" - with self._lock: - self.db_path.parent.mkdir(parents=True, exist_ok=True) - conn = self._get_connection() - - current_version = self._get_schema_version(conn) - if current_version > self.SCHEMA_VERSION: - raise StorageError( - f"Database schema version {current_version} is newer than " - f"supported version {self.SCHEMA_VERSION}. 
" - f"Please update the application or use a compatible database.", - db_path=str(self.db_path), - operation="initialize", - details={ - "current_version": current_version, - "supported_version": self.SCHEMA_VERSION, - }, - ) - - if current_version == 0: - self._create_schema(conn) - self._set_schema_version(conn, self.SCHEMA_VERSION) - elif current_version < self.SCHEMA_VERSION: - self._apply_migrations(conn, current_version) - self._set_schema_version(conn, self.SCHEMA_VERSION) - - conn.commit() - - def close(self) -> None: - """Close database connection.""" - with self._lock: - if self._conn is not None: - try: - self._conn.close() - except Exception: - pass - finally: - self._conn = None - - def __enter__(self) -> "GlobalSymbolIndex": - self.initialize() - return self - - def __exit__(self, exc_type: object, exc: object, tb: object) -> None: - self.close() - - def add_symbol(self, symbol: Symbol, file_path: str | Path, index_path: str | Path) -> None: - """Insert a single symbol (idempotent) for incremental updates.""" - file_path_str = str(Path(file_path).resolve()) - index_path_str = str(Path(index_path).resolve()) - - with self._lock: - conn = self._get_connection() - try: - conn.execute( - """ - INSERT INTO global_symbols( - project_id, symbol_name, symbol_kind, - file_path, start_line, end_line, index_path - ) - VALUES(?, ?, ?, ?, ?, ?, ?) 
- ON CONFLICT( - project_id, symbol_name, symbol_kind, - file_path, start_line, end_line - ) - DO UPDATE SET - index_path=excluded.index_path - """, - ( - self.project_id, - symbol.name, - symbol.kind, - file_path_str, - symbol.range[0], - symbol.range[1], - index_path_str, - ), - ) - conn.commit() - except sqlite3.DatabaseError as exc: - conn.rollback() - raise StorageError( - f"Failed to add symbol {symbol.name}: {exc}", - db_path=str(self.db_path), - operation="add_symbol", - ) from exc - - def update_file_symbols( - self, - file_path: str | Path, - symbols: List[Symbol], - index_path: str | Path | None = None, - ) -> None: - """Replace all symbols for a file atomically (delete + insert).""" - file_path_str = str(Path(file_path).resolve()) - - index_path_str: Optional[str] - if index_path is not None: - index_path_str = str(Path(index_path).resolve()) - else: - index_path_str = self._get_existing_index_path(file_path_str) - - with self._lock: - conn = self._get_connection() - try: - conn.execute("BEGIN") - conn.execute( - "DELETE FROM global_symbols WHERE project_id=? AND file_path=?", - (self.project_id, file_path_str), - ) - - if symbols: - if not index_path_str: - raise StorageError( - "index_path is required when inserting symbols for a new file", - db_path=str(self.db_path), - operation="update_file_symbols", - details={"file_path": file_path_str}, - ) - - rows = [ - ( - self.project_id, - s.name, - s.kind, - file_path_str, - s.range[0], - s.range[1], - index_path_str, - ) - for s in symbols - ] - conn.executemany( - """ - INSERT INTO global_symbols( - project_id, symbol_name, symbol_kind, - file_path, start_line, end_line, index_path - ) - VALUES(?, ?, ?, ?, ?, ?, ?) 
- ON CONFLICT( - project_id, symbol_name, symbol_kind, - file_path, start_line, end_line - ) - DO UPDATE SET - index_path=excluded.index_path - """, - rows, - ) - - conn.commit() - except sqlite3.DatabaseError as exc: - conn.rollback() - raise StorageError( - f"Failed to update symbols for {file_path_str}: {exc}", - db_path=str(self.db_path), - operation="update_file_symbols", - ) from exc - - def delete_file_symbols(self, file_path: str | Path) -> int: - """Remove all symbols for a file. Returns number of rows deleted.""" - file_path_str = str(Path(file_path).resolve()) - with self._lock: - conn = self._get_connection() - try: - cur = conn.execute( - "DELETE FROM global_symbols WHERE project_id=? AND file_path=?", - (self.project_id, file_path_str), - ) - conn.commit() - return int(cur.rowcount or 0) - except sqlite3.DatabaseError as exc: - conn.rollback() - raise StorageError( - f"Failed to delete symbols for {file_path_str}: {exc}", - db_path=str(self.db_path), - operation="delete_file_symbols", - ) from exc - - def search( - self, - name: str, - kind: Optional[str] = None, - limit: int = 50, - prefix_mode: bool = True, - ) -> List[Symbol]: - """Search symbols and return full Symbol objects.""" - if prefix_mode: - pattern = f"{name}%" - else: - pattern = f"%{name}%" - - with self._lock: - conn = self._get_connection() - if kind: - rows = conn.execute( - """ - SELECT symbol_name, symbol_kind, file_path, start_line, end_line - FROM global_symbols - WHERE project_id=? AND symbol_name LIKE ? AND symbol_kind=? - ORDER BY symbol_name - LIMIT ? - """, - (self.project_id, pattern, kind, limit), - ).fetchall() - else: - rows = conn.execute( - """ - SELECT symbol_name, symbol_kind, file_path, start_line, end_line - FROM global_symbols - WHERE project_id=? AND symbol_name LIKE ? - ORDER BY symbol_name - LIMIT ? 
- """, - (self.project_id, pattern, limit), - ).fetchall() - - return [ - Symbol( - name=row["symbol_name"], - kind=row["symbol_kind"], - range=(row["start_line"], row["end_line"]), - file=row["file_path"], - ) - for row in rows - ] - - def search_symbols( - self, - name: str, - kind: Optional[str] = None, - limit: int = 50, - prefix_mode: bool = True, - ) -> List[Tuple[str, Tuple[int, int]]]: - """Search symbols and return only (file_path, (start_line, end_line)).""" - symbols = self.search(name=name, kind=kind, limit=limit, prefix_mode=prefix_mode) - return [(s.file or "", s.range) for s in symbols] - - def get_file_symbols(self, file_path: str | Path) -> List[Symbol]: - """Get all symbols in a specific file, sorted by start_line. - - Args: - file_path: Full path to the file - - Returns: - List of Symbol objects sorted by start_line - """ - file_path_str = str(Path(file_path).resolve()) - - with self._lock: - conn = self._get_connection() - rows = conn.execute( - """ - SELECT symbol_name, symbol_kind, file_path, start_line, end_line - FROM global_symbols - WHERE project_id=? AND file_path=? - ORDER BY start_line - """, - (self.project_id, file_path_str), - ).fetchall() - - return [ - Symbol( - name=row["symbol_name"], - kind=row["symbol_kind"], - range=(row["start_line"], row["end_line"]), - file=row["file_path"], - ) - for row in rows - ] - - def _get_existing_index_path(self, file_path_str: str) -> Optional[str]: - with self._lock: - conn = self._get_connection() - row = conn.execute( - """ - SELECT index_path - FROM global_symbols - WHERE project_id=? AND file_path=? 
- LIMIT 1 - """, - (self.project_id, file_path_str), - ).fetchone() - return str(row["index_path"]) if row else None - - def _get_schema_version(self, conn: sqlite3.Connection) -> int: - try: - row = conn.execute("PRAGMA user_version").fetchone() - return int(row[0]) if row else 0 - except Exception: - return 0 - - def _set_schema_version(self, conn: sqlite3.Connection, version: int) -> None: - conn.execute(f"PRAGMA user_version = {int(version)}") - - def _apply_migrations(self, conn: sqlite3.Connection, from_version: int) -> None: - # No migrations yet (v1). - _ = (conn, from_version) - return - - def _get_connection(self) -> sqlite3.Connection: - if self._conn is None: - self._conn = sqlite3.connect(str(self.db_path), check_same_thread=False) - self._conn.row_factory = sqlite3.Row - self._conn.execute("PRAGMA journal_mode=WAL") - self._conn.execute("PRAGMA synchronous=NORMAL") - self._conn.execute("PRAGMA foreign_keys=ON") - self._conn.execute("PRAGMA mmap_size=30000000000") - return self._conn - - def _create_schema(self, conn: sqlite3.Connection) -> None: - try: - conn.execute( - """ - CREATE TABLE IF NOT EXISTS global_symbols ( - id INTEGER PRIMARY KEY, - project_id INTEGER NOT NULL, - symbol_name TEXT NOT NULL, - symbol_kind TEXT NOT NULL, - file_path TEXT NOT NULL, - start_line INTEGER, - end_line INTEGER, - index_path TEXT NOT NULL, - UNIQUE( - project_id, symbol_name, symbol_kind, - file_path, start_line, end_line - ) - ) - """ - ) - - # Required by optimization spec. - conn.execute( - """ - CREATE INDEX IF NOT EXISTS idx_global_symbols_name_kind - ON global_symbols(symbol_name, symbol_kind) - """ - ) - # Used by common queries (project-scoped name lookups). 
- conn.execute( - """ - CREATE INDEX IF NOT EXISTS idx_global_symbols_project_name_kind - ON global_symbols(project_id, symbol_name, symbol_kind) - """ - ) - conn.execute( - """ - CREATE INDEX IF NOT EXISTS idx_global_symbols_project_file - ON global_symbols(project_id, file_path) - """ - ) - conn.execute( - """ - CREATE INDEX IF NOT EXISTS idx_global_symbols_project_index_path - ON global_symbols(project_id, index_path) - """ - ) - except sqlite3.DatabaseError as exc: - raise StorageError( - f"Failed to initialize global symbol schema: {exc}", - db_path=str(self.db_path), - operation="_create_schema", - ) from exc - diff --git a/codex-lens/build/lib/codexlens/storage/index_tree.py b/codex-lens/build/lib/codexlens/storage/index_tree.py deleted file mode 100644 index 40ad85e7..00000000 --- a/codex-lens/build/lib/codexlens/storage/index_tree.py +++ /dev/null @@ -1,1064 +0,0 @@ -"""Hierarchical index tree builder for CodexLens. - -Constructs a bottom-up directory index tree with parallel processing support. -Each directory maintains its own _index.db with files and subdirectory links. 
-""" - -from __future__ import annotations - -import logging -import os -import re -import sqlite3 -import time -from concurrent.futures import ProcessPoolExecutor, as_completed -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List, Optional, Set, Tuple - -from codexlens.config import Config -from codexlens.parsers.factory import ParserFactory -from codexlens.storage.dir_index import DirIndexStore -from codexlens.storage.global_index import GlobalSymbolIndex -from codexlens.storage.path_mapper import PathMapper -from codexlens.storage.registry import ProjectInfo, RegistryStore - - -@dataclass -class BuildResult: - """Complete build operation result.""" - - project_id: int - source_root: Path - index_root: Path - total_files: int - total_dirs: int - errors: List[str] - - -@dataclass -class DirBuildResult: - """Single directory build result.""" - - source_path: Path - index_path: Path - files_count: int - symbols_count: int - subdirs: List[str] # Subdirectory names - error: Optional[str] = None - - -class IndexTreeBuilder: - """Hierarchical index tree builder with parallel processing. - - Builds directory indexes bottom-up to enable proper subdirectory linking. - Each directory gets its own _index.db containing: - - Files in that directory - - Links to child directory indexes - - Symbols and FTS5 search - - Attributes: - registry: Global project registry - mapper: Path mapping between source and index - config: CodexLens configuration - parser_factory: Parser factory for symbol extraction - logger: Logger instance - IGNORE_DIRS: Set of directory names to skip during indexing - """ - - # Directories to skip during indexing - IGNORE_DIRS: Set[str] = { - ".git", - ".venv", - "venv", - "node_modules", - "__pycache__", - ".codexlens", - ".idea", - ".vscode", - } - - def __init__( - self, registry: RegistryStore, mapper: PathMapper, config: Config = None, incremental: bool = True - ): - """Initialize the index tree builder. 
- - Args: - registry: Global registry store for project tracking - mapper: Path mapper for source to index conversions - config: CodexLens configuration (uses defaults if None) - incremental: Enable incremental indexing (default True) - """ - self.registry = registry - self.mapper = mapper - self.config = config or Config() - self.parser_factory = ParserFactory(self.config) - self.logger = logging.getLogger(__name__) - self.incremental = incremental - - def build( - self, - source_root: Path, - languages: List[str] = None, - workers: int = None, - force_full: bool = False, - ) -> BuildResult: - """Build complete index tree for a project. - - Process: - 1. Register project in registry - 2. Collect all directories grouped by depth - 3. Build indexes bottom-up (deepest first) - 4. Link subdirectories to parents - 5. Update project statistics - 6. Cleanup deleted files (if incremental mode) - - Args: - source_root: Project root directory to index - languages: Optional list of language IDs to limit indexing - workers: Number of parallel worker processes - force_full: Force full reindex (override incremental mode) - - Returns: - BuildResult with statistics and errors - - Raises: - ValueError: If source_root doesn't exist - """ - source_root = source_root.resolve() - if not source_root.exists(): - raise ValueError(f"Source root does not exist: {source_root}") - - # Auto-detect optimal worker count if not specified - if workers is None: - workers = min(os.cpu_count() or 4, 16) # Cap at 16 workers - self.logger.debug("Auto-detected %d workers for parallel indexing", workers) - - # Override incremental mode if force_full is True - use_incremental = self.incremental and not force_full - if force_full: - self.logger.info("Building index tree for %s (FULL reindex)", source_root) - else: - self.logger.info("Building index tree for %s (incremental=%s)", source_root, use_incremental) - - # Register project - index_root = self.mapper.source_to_index_dir(source_root) - project_info 
= self.registry.register_project(source_root, index_root) - global_index_db_path = index_root / GlobalSymbolIndex.DEFAULT_DB_NAME - - global_index: GlobalSymbolIndex | None = None - if self.config.global_symbol_index_enabled: - global_index = GlobalSymbolIndex(global_index_db_path, project_id=project_info.id) - global_index.initialize() - - # Report progress: discovering files (5%) - print("Discovering files...", flush=True) - - # Collect directories by depth - dirs_by_depth = self._collect_dirs_by_depth(source_root, languages) - - if not dirs_by_depth: - self.logger.warning("No indexable directories found in %s", source_root) - if global_index is not None: - global_index.close() - return BuildResult( - project_id=project_info.id, - source_root=source_root, - index_root=index_root, - total_files=0, - total_dirs=0, - errors=["No indexable directories found"], - ) - - # Calculate total directories for progress tracking - total_dirs_to_process = sum(len(dirs) for dirs in dirs_by_depth.values()) - processed_dirs = 0 - - # Report progress: building index (10%) - print("Building index...", flush=True) - - total_files = 0 - total_dirs = 0 - all_errors: List[str] = [] - all_results: List[DirBuildResult] = [] # Store all results for subdir linking - - # Build bottom-up (highest depth first) - max_depth = max(dirs_by_depth.keys()) - for depth in range(max_depth, -1, -1): - if depth not in dirs_by_depth: - continue - - dirs = dirs_by_depth[depth] - self.logger.info("Building %d directories at depth %d", len(dirs), depth) - - # Build directories at this level in parallel - results = self._build_level_parallel( - dirs, - languages, - workers, - project_id=project_info.id, - global_index_db_path=global_index_db_path, - ) - all_results.extend(results) - - # Process results - for result in results: - if result.error: - all_errors.append(f"{result.source_path}: {result.error}") - processed_dirs += 1 - continue - - total_files += result.files_count - total_dirs += 1 - processed_dirs 
+= 1 - - # Report progress for each processed directory (10-80%) - # Use "Processing file" format for frontend parser compatibility - progress_percent = 10 + int((processed_dirs / total_dirs_to_process) * 70) - print(f"Processing file {processed_dirs}/{total_dirs_to_process}: {result.source_path.name}", flush=True) - - # Register directory in registry - self.registry.register_dir( - project_id=project_info.id, - source_path=result.source_path, - index_path=result.index_path, - depth=self.mapper.get_relative_depth(result.source_path, source_root), - files_count=result.files_count, - ) - - # Report progress: linking subdirectories (80%) - print("Linking subdirectories...", flush=True) - - # After building all directories, link subdirectories to parents - # This needs to happen after all indexes exist - for result in all_results: - if result.error: - continue - # Link children to this directory - self._link_children_to_parent(result.source_path, all_results) - - # Cleanup deleted files if in incremental mode - if use_incremental: - # Report progress: cleaning up (90%) - print("Cleaning up deleted files...", flush=True) - self.logger.info("Cleaning up deleted files...") - total_deleted = 0 - for result in all_results: - if result.error: - continue - try: - with DirIndexStore(result.index_path, config=self.config, global_index=global_index) as store: - deleted_count = store.cleanup_deleted_files(result.source_path) - if deleted_count > 0: - _compute_graph_neighbors(store, logger=self.logger) - store.update_merkle_root() - total_deleted += deleted_count - if deleted_count > 0: - self.logger.debug("Removed %d deleted files from %s", deleted_count, result.source_path) - except Exception as exc: - self.logger.warning("Cleanup failed for %s: %s", result.source_path, exc) - - if total_deleted > 0: - self.logger.info("Removed %d deleted files from index", total_deleted) - - # Report progress: finalizing (95%) - print("Finalizing...", flush=True) - - # Update project statistics 
- self.registry.update_project_stats(source_root, total_files, total_dirs) - - # Report completion (100%) - print(f"Indexed {total_files} files", flush=True) - - self.logger.info( - "Index build complete: %d files, %d directories, %d errors", - total_files, - total_dirs, - len(all_errors), - ) - - if global_index is not None: - global_index.close() - - return BuildResult( - project_id=project_info.id, - source_root=source_root, - index_root=index_root, - total_files=total_files, - total_dirs=total_dirs, - errors=all_errors, - ) - - def update_subtree( - self, - source_path: Path, - languages: List[str] = None, - workers: int = None, - ) -> BuildResult: - """Incrementally update a subtree. - - Rebuilds indexes for the specified directory and all subdirectories. - Useful for incremental updates when only part of the tree changed. - - Args: - source_path: Root of subtree to update - languages: Optional list of language IDs to limit indexing - workers: Number of parallel worker processes - - Returns: - BuildResult for the subtree - - Raises: - ValueError: If source_path is not indexed - """ - source_path = source_path.resolve() - project_root = self.mapper.get_project_root(source_path) - - # Get project info - project_info = self.registry.get_project(project_root) - if not project_info: - raise ValueError(f"Directory not indexed: {source_path}") - - self.logger.info("Updating subtree at %s", source_path) - - # Use build logic but start from source_path - return self.build(source_path, languages, workers) - - def rebuild_dir(self, source_path: Path) -> DirBuildResult: - """Rebuild index for a single directory. - - Only rebuilds the specified directory, does not touch subdirectories. - Useful for updating a single directory after file changes. 
- - Args: - source_path: Directory to rebuild - - Returns: - DirBuildResult for the directory - """ - source_path = source_path.resolve() - self.logger.info("Rebuilding directory %s", source_path) - project_root = self.mapper.get_project_root(source_path) - project_info = self.registry.get_project(project_root) - if not project_info: - raise ValueError(f"Directory not indexed: {source_path}") - - global_index_db_path = project_info.index_root / GlobalSymbolIndex.DEFAULT_DB_NAME - return self._build_single_dir( - source_path, - languages=None, - project_id=project_info.id, - global_index_db_path=global_index_db_path, - ) - - # === Internal Methods === - - def _collect_dirs_by_depth( - self, source_root: Path, languages: List[str] = None - ) -> Dict[int, List[Path]]: - """Collect all indexable directories grouped by depth. - - Walks the directory tree and groups directories by their depth - relative to source_root. Depth 0 is the root itself. - - Args: - source_root: Root directory to start from - languages: Optional language filter - - Returns: - Dictionary mapping depth to list of directory paths - Example: {0: [root], 1: [src, tests], 2: [src/api, src/utils]} - """ - source_root = source_root.resolve() - dirs_by_depth: Dict[int, List[Path]] = {} - - # Always include the root directory at depth 0 for chain search entry point - dirs_by_depth[0] = [source_root] - - for root, dirnames, _ in os.walk(source_root): - # Filter out ignored directories - dirnames[:] = [ - d - for d in dirnames - if d not in self.IGNORE_DIRS and not d.startswith(".") - ] - - root_path = Path(root) - - # Skip root (already added) - if root_path == source_root: - continue - - # Check if this directory should be indexed - if not self._should_index_dir(root_path, languages): - continue - - # Calculate depth relative to source_root - try: - depth = len(root_path.relative_to(source_root).parts) - except ValueError: - continue - - if depth not in dirs_by_depth: - dirs_by_depth[depth] = [] - - 
dirs_by_depth[depth].append(root_path) - - return dirs_by_depth - - def _should_index_dir(self, dir_path: Path, languages: List[str] = None) -> bool: - """Check if directory should be indexed. - - A directory is indexed if: - 1. It's not in IGNORE_DIRS - 2. It doesn't start with '.' - 3. It contains at least one supported language file, OR - 4. It has subdirectories that contain supported files (transitive) - - Args: - dir_path: Directory to check - languages: Optional language filter - - Returns: - True if directory should be indexed - """ - # Check directory name - if dir_path.name in self.IGNORE_DIRS or dir_path.name.startswith("."): - return False - - # Check for supported files in this directory - source_files = self._iter_source_files(dir_path, languages) - if len(source_files) > 0: - return True - - # Check if any subdirectory has indexable files (transitive) - # This handles cases like 'src' which has no direct files but has 'src/codexlens' - for item in dir_path.iterdir(): - if not item.is_dir(): - continue - if item.name in self.IGNORE_DIRS or item.name.startswith("."): - continue - # Recursively check subdirectories - if self._has_indexable_files_recursive(item, languages): - return True - - return False - - def _has_indexable_files_recursive(self, dir_path: Path, languages: List[str] = None) -> bool: - """Check if directory or any subdirectory has indexable files. 
- - Args: - dir_path: Directory to check - languages: Optional language filter - - Returns: - True if directory tree contains indexable files - """ - # Check for supported files in this directory - source_files = self._iter_source_files(dir_path, languages) - if len(source_files) > 0: - return True - - # Check subdirectories - try: - for item in dir_path.iterdir(): - if not item.is_dir(): - continue - if item.name in self.IGNORE_DIRS or item.name.startswith("."): - continue - if self._has_indexable_files_recursive(item, languages): - return True - except PermissionError: - pass - - return False - - def _build_level_parallel( - self, - dirs: List[Path], - languages: List[str], - workers: int, - *, - project_id: int, - global_index_db_path: Path, - ) -> List[DirBuildResult]: - """Build multiple directories in parallel. - - Uses ProcessPoolExecutor to build directories concurrently. - All directories at the same level are independent and can be - processed in parallel. - - Args: - dirs: List of directories to build - languages: Language filter - workers: Number of worker processes - - Returns: - List of DirBuildResult objects - """ - results: List[DirBuildResult] = [] - - if not dirs: - return results - - # For single directory, avoid overhead of process pool - if len(dirs) == 1: - result = self._build_single_dir( - dirs[0], - languages, - project_id=project_id, - global_index_db_path=global_index_db_path, - ) - return [result] - - # Prepare arguments for worker processes - config_dict = { - "data_dir": str(self.config.data_dir), - "supported_languages": self.config.supported_languages, - "parsing_rules": self.config.parsing_rules, - "global_symbol_index_enabled": self.config.global_symbol_index_enabled, - } - - worker_args = [ - ( - dir_path, - self.mapper.source_to_index_db(dir_path), - languages, - config_dict, - int(project_id), - str(global_index_db_path), - ) - for dir_path in dirs - ] - - # Execute in parallel - with ProcessPoolExecutor(max_workers=workers) as 
executor: - futures = { - executor.submit(_build_dir_worker, args): args[0] - for args in worker_args - } - - for future in as_completed(futures): - try: - result = future.result() - results.append(result) - except Exception as exc: - dir_path = futures[future] - self.logger.error("Failed to build %s: %s", dir_path, exc) - results.append( - DirBuildResult( - source_path=dir_path, - index_path=self.mapper.source_to_index_db(dir_path), - files_count=0, - symbols_count=0, - subdirs=[], - error=str(exc), - ) - ) - - return results - - def _build_single_dir( - self, - dir_path: Path, - languages: List[str] = None, - *, - project_id: int, - global_index_db_path: Path, - ) -> DirBuildResult: - """Build index for a single directory. - - Creates _index.db and indexes all files in the directory. - Does not recurse into subdirectories. - - Args: - dir_path: Directory to index - languages: Optional language filter - - Returns: - DirBuildResult with statistics and subdirectory list - """ - dir_path = dir_path.resolve() - index_db_path = self.mapper.source_to_index_db(dir_path) - - global_index: GlobalSymbolIndex | None = None - try: - # Ensure index directory exists - index_db_path.parent.mkdir(parents=True, exist_ok=True) - - # Create directory index - if self.config.global_symbol_index_enabled: - global_index = GlobalSymbolIndex(global_index_db_path, project_id=project_id) - global_index.initialize() - - store = DirIndexStore(index_db_path, config=self.config, global_index=global_index) - store.initialize() - - # Get source files in this directory only - source_files = self._iter_source_files(dir_path, languages) - - files_count = 0 - symbols_count = 0 - skipped_count = 0 - - for file_path in source_files: - try: - # Check if file needs reindexing (incremental mode) - if self.incremental and not store.needs_reindex(file_path): - skipped_count += 1 - continue - - # Read and parse file - text = file_path.read_text(encoding="utf-8", errors="ignore") - language_id = 
self.config.language_for_path(file_path) - if not language_id: - continue - - parser = self.parser_factory.get_parser(language_id) - indexed_file = parser.parse(text, file_path) - - # Add to directory index - store.add_file( - name=file_path.name, - full_path=file_path, - content=text, - language=language_id, - symbols=indexed_file.symbols, - relationships=indexed_file.relationships, - ) - - files_count += 1 - symbols_count += len(indexed_file.symbols) - - except Exception as exc: - self.logger.debug("Failed to index %s: %s", file_path, exc) - continue - - if files_count > 0: - _compute_graph_neighbors(store, logger=self.logger) - - # Get list of subdirectories - subdirs = [ - d.name - for d in dir_path.iterdir() - if d.is_dir() - and d.name not in self.IGNORE_DIRS - and not d.name.startswith(".") - ] - - store.update_merkle_root() - store.close() - if global_index is not None: - global_index.close() - - if skipped_count > 0: - self.logger.debug( - "Built %s: %d files indexed, %d skipped (unchanged), %d symbols, %d subdirs", - dir_path, - files_count, - skipped_count, - symbols_count, - len(subdirs), - ) - else: - self.logger.debug( - "Built %s: %d files, %d symbols, %d subdirs", - dir_path, - files_count, - symbols_count, - len(subdirs), - ) - - return DirBuildResult( - source_path=dir_path, - index_path=index_db_path, - files_count=files_count, - symbols_count=symbols_count, - subdirs=subdirs, - ) - - except Exception as exc: - self.logger.error("Failed to build directory %s: %s", dir_path, exc) - if global_index is not None: - try: - global_index.close() - except Exception: - pass - return DirBuildResult( - source_path=dir_path, - index_path=index_db_path, - files_count=0, - symbols_count=0, - subdirs=[], - error=str(exc), - ) - - def _link_children_to_parent( - self, parent_path: Path, all_results: List[DirBuildResult] - ) -> None: - """Link child directory indexes to parent's subdirs table. 
- - Finds all direct children of parent_path in all_results and - registers them as subdirectories in the parent's index. - - Args: - parent_path: Parent directory path - all_results: List of all build results - """ - parent_index_db = self.mapper.source_to_index_db(parent_path) - - try: - with DirIndexStore(parent_index_db, config=self.config) as store: - for result in all_results: - # Only register direct children (parent is one level up) - if result.source_path.parent != parent_path: - continue - - if result.error: - continue - - # Register subdirectory link - store.register_subdir( - name=result.source_path.name, - index_path=result.index_path, - files_count=result.files_count, - direct_files=result.files_count, - ) - self.logger.debug( - "Linked %s to parent %s", - result.source_path.name, - parent_path, - ) - - store.update_merkle_root() - - except Exception as exc: - self.logger.error( - "Failed to link children to %s: %s", parent_path, exc - ) - - def _iter_source_files( - self, dir_path: Path, languages: List[str] = None - ) -> List[Path]: - """Iterate source files in directory (non-recursive). - - Returns files in the specified directory that match language filters. - Does not recurse into subdirectories. 
- - Args: - dir_path: Directory to scan - languages: Optional language filter - - Returns: - List of source file paths - """ - files: List[Path] = [] - - if not dir_path.is_dir(): - return files - - for item in dir_path.iterdir(): - if not item.is_file(): - continue - - if item.name.startswith("."): - continue - - # Check language support - language_id = self.config.language_for_path(item) - if not language_id: - continue - - # Apply language filter - if languages and language_id not in languages: - continue - - files.append(item) - - return files - - -def _normalize_relationship_target(target: str) -> str: - """Best-effort normalization of a relationship target into a local symbol name.""" - target = (target or "").strip() - if not target: - return "" - - # Drop trailing call parentheses when present (e.g., "foo()" -> "foo"). - if target.endswith("()"): - target = target[:-2] - - # Keep the leaf identifier for common qualified formats. - for sep in ("::", ".", "#"): - if sep in target: - target = target.split(sep)[-1] - - # Strip non-identifier suffix/prefix noise. - target = re.sub(r"^[^A-Za-z0-9_]+", "", target) - target = re.sub(r"[^A-Za-z0-9_]+$", "", target) - return target - - -def _compute_graph_neighbors( - store: DirIndexStore, - *, - max_depth: int = 2, - logger: Optional[logging.Logger] = None, -) -> None: - """Compute and persist N-hop neighbors for all symbols in a directory index.""" - if max_depth <= 0: - return - - log = logger or logging.getLogger(__name__) - - with store._lock: - conn = store._get_connection() - conn.row_factory = sqlite3.Row - - # Ensure schema exists even for older databases pinned to the same user_version. 
- try: - from codexlens.storage.migrations.migration_007_add_graph_neighbors import upgrade - - upgrade(conn) - except Exception as exc: - log.debug("Graph neighbor schema ensure failed: %s", exc) - - cursor = conn.cursor() - - try: - cursor.execute("DELETE FROM graph_neighbors") - except sqlite3.Error: - # Table missing or schema mismatch; skip gracefully. - return - - try: - symbol_rows = cursor.execute( - "SELECT id, file_id, name FROM symbols" - ).fetchall() - rel_rows = cursor.execute( - "SELECT source_symbol_id, target_qualified_name FROM code_relationships" - ).fetchall() - except sqlite3.Error: - return - - if not symbol_rows or not rel_rows: - try: - conn.commit() - except sqlite3.Error: - pass - return - - symbol_file_by_id: Dict[int, int] = {} - symbols_by_file_and_name: Dict[Tuple[int, str], List[int]] = {} - symbols_by_name: Dict[str, List[int]] = {} - - for row in symbol_rows: - symbol_id = int(row["id"]) - file_id = int(row["file_id"]) - name = str(row["name"]) - symbol_file_by_id[symbol_id] = file_id - symbols_by_file_and_name.setdefault((file_id, name), []).append(symbol_id) - symbols_by_name.setdefault(name, []).append(symbol_id) - - adjacency: Dict[int, Set[int]] = {} - - for row in rel_rows: - source_id = int(row["source_symbol_id"]) - target_raw = str(row["target_qualified_name"] or "") - target_name = _normalize_relationship_target(target_raw) - if not target_name: - continue - - source_file_id = symbol_file_by_id.get(source_id) - if source_file_id is None: - continue - - candidate_ids = symbols_by_file_and_name.get((source_file_id, target_name)) - if not candidate_ids: - global_candidates = symbols_by_name.get(target_name, []) - # Only resolve cross-file by name when unambiguous. 
- candidate_ids = global_candidates if len(global_candidates) == 1 else [] - - for target_id in candidate_ids: - if target_id == source_id: - continue - adjacency.setdefault(source_id, set()).add(target_id) - adjacency.setdefault(target_id, set()).add(source_id) - - if not adjacency: - try: - conn.commit() - except sqlite3.Error: - pass - return - - insert_rows: List[Tuple[int, int, int]] = [] - max_depth = min(int(max_depth), 2) - - for source_id, first_hop in adjacency.items(): - if not first_hop: - continue - for neighbor_id in first_hop: - insert_rows.append((source_id, neighbor_id, 1)) - - if max_depth < 2: - continue - - second_hop: Set[int] = set() - for neighbor_id in first_hop: - second_hop.update(adjacency.get(neighbor_id, set())) - - second_hop.discard(source_id) - second_hop.difference_update(first_hop) - - for neighbor_id in second_hop: - insert_rows.append((source_id, neighbor_id, 2)) - - if not insert_rows: - try: - conn.commit() - except sqlite3.Error: - pass - return - - try: - cursor.executemany( - """ - INSERT INTO graph_neighbors( - source_symbol_id, neighbor_symbol_id, relationship_depth - ) - VALUES(?, ?, ?) - """, - insert_rows, - ) - conn.commit() - except sqlite3.Error: - return - - -# === Worker Function for ProcessPoolExecutor === - - -def _build_dir_worker(args: tuple) -> DirBuildResult: - """Worker function for parallel directory building. - - Must be at module level for ProcessPoolExecutor pickling. - Reconstructs necessary objects from serializable arguments. 
- - Args: - args: Tuple of (dir_path, index_db_path, languages, config_dict, project_id, global_index_db_path) - - Returns: - DirBuildResult for the directory - """ - dir_path, index_db_path, languages, config_dict, project_id, global_index_db_path = args - - # Reconstruct config - config = Config( - data_dir=Path(config_dict["data_dir"]), - supported_languages=config_dict["supported_languages"], - parsing_rules=config_dict["parsing_rules"], - global_symbol_index_enabled=bool(config_dict.get("global_symbol_index_enabled", True)), - ) - - parser_factory = ParserFactory(config) - - global_index: GlobalSymbolIndex | None = None - try: - # Ensure index directory exists - index_db_path.parent.mkdir(parents=True, exist_ok=True) - - # Create directory index - if config.global_symbol_index_enabled and global_index_db_path: - global_index = GlobalSymbolIndex(Path(global_index_db_path), project_id=int(project_id)) - global_index.initialize() - - store = DirIndexStore(index_db_path, config=config, global_index=global_index) - store.initialize() - - files_count = 0 - symbols_count = 0 - - # Index files in this directory - for item in dir_path.iterdir(): - if not item.is_file(): - continue - - if item.name.startswith("."): - continue - - language_id = config.language_for_path(item) - if not language_id: - continue - - if languages and language_id not in languages: - continue - - try: - text = item.read_text(encoding="utf-8", errors="ignore") - parser = parser_factory.get_parser(language_id) - indexed_file = parser.parse(text, item) - - store.add_file( - name=item.name, - full_path=item, - content=text, - language=language_id, - symbols=indexed_file.symbols, - relationships=indexed_file.relationships, - ) - - files_count += 1 - symbols_count += len(indexed_file.symbols) - - except Exception: - continue - - if files_count > 0: - _compute_graph_neighbors(store) - - # Get subdirectories - ignore_dirs = { - ".git", - ".venv", - "venv", - "node_modules", - "__pycache__", - 
".codexlens", - ".idea", - ".vscode", - } - - subdirs = [ - d.name - for d in dir_path.iterdir() - if d.is_dir() and d.name not in ignore_dirs and not d.name.startswith(".") - ] - - store.update_merkle_root() - store.close() - if global_index is not None: - global_index.close() - - return DirBuildResult( - source_path=dir_path, - index_path=index_db_path, - files_count=files_count, - symbols_count=symbols_count, - subdirs=subdirs, - ) - - except Exception as exc: - if global_index is not None: - try: - global_index.close() - except Exception: - pass - return DirBuildResult( - source_path=dir_path, - index_path=index_db_path, - files_count=0, - symbols_count=0, - subdirs=[], - error=str(exc), - ) diff --git a/codex-lens/build/lib/codexlens/storage/merkle_tree.py b/codex-lens/build/lib/codexlens/storage/merkle_tree.py deleted file mode 100644 index c8c76988..00000000 --- a/codex-lens/build/lib/codexlens/storage/merkle_tree.py +++ /dev/null @@ -1,136 +0,0 @@ -"""Merkle tree utilities for change detection. - -This module provides a generic, file-system based Merkle tree implementation -that can be used to efficiently diff directory states. 
-""" - -from __future__ import annotations - -import hashlib -from dataclasses import dataclass, field -from pathlib import Path -from typing import Dict, Iterable, List, Optional - - -def sha256_bytes(data: bytes) -> str: - return hashlib.sha256(data).hexdigest() - - -def sha256_text(text: str) -> str: - return sha256_bytes(text.encode("utf-8", errors="ignore")) - - -@dataclass -class MerkleNode: - """A Merkle node representing either a file (leaf) or directory (internal).""" - - name: str - rel_path: str - hash: str - is_dir: bool - children: Dict[str, "MerkleNode"] = field(default_factory=dict) - - def iter_files(self) -> Iterable["MerkleNode"]: - if not self.is_dir: - yield self - return - for child in self.children.values(): - yield from child.iter_files() - - -@dataclass -class MerkleTree: - """Merkle tree for a directory snapshot.""" - - root: MerkleNode - - @classmethod - def build_from_directory(cls, root_dir: Path) -> "MerkleTree": - root_dir = Path(root_dir).resolve() - node = cls._build_node(root_dir, base=root_dir) - return cls(root=node) - - @classmethod - def _build_node(cls, path: Path, *, base: Path) -> MerkleNode: - if path.is_file(): - rel = str(path.relative_to(base)).replace("\\", "/") - return MerkleNode( - name=path.name, - rel_path=rel, - hash=sha256_bytes(path.read_bytes()), - is_dir=False, - ) - - if not path.is_dir(): - rel = str(path.relative_to(base)).replace("\\", "/") - return MerkleNode(name=path.name, rel_path=rel, hash="", is_dir=False) - - children: Dict[str, MerkleNode] = {} - for child in sorted(path.iterdir(), key=lambda p: p.name): - child_node = cls._build_node(child, base=base) - children[child_node.name] = child_node - - items = [ - f"{'d' if n.is_dir else 'f'}:{name}:{n.hash}" - for name, n in sorted(children.items(), key=lambda kv: kv[0]) - ] - dir_hash = sha256_text("\n".join(items)) - - rel_path = "." if path == base else str(path.relative_to(base)).replace("\\", "/") - return MerkleNode( - name="." 
if path == base else path.name, - rel_path=rel_path, - hash=dir_hash, - is_dir=True, - children=children, - ) - - @staticmethod - def find_changed_files(old: Optional["MerkleTree"], new: Optional["MerkleTree"]) -> List[str]: - """Find changed/added/removed files between two trees. - - Returns: - List of relative file paths (POSIX-style separators). - """ - if old is None and new is None: - return [] - if old is None: - return sorted({n.rel_path for n in new.root.iter_files()}) # type: ignore[union-attr] - if new is None: - return sorted({n.rel_path for n in old.root.iter_files()}) - - changed: set[str] = set() - - def walk(old_node: Optional[MerkleNode], new_node: Optional[MerkleNode]) -> None: - if old_node is None and new_node is None: - return - - if old_node is None and new_node is not None: - changed.update(n.rel_path for n in new_node.iter_files()) - return - - if new_node is None and old_node is not None: - changed.update(n.rel_path for n in old_node.iter_files()) - return - - assert old_node is not None and new_node is not None - - if old_node.hash == new_node.hash: - return - - if not old_node.is_dir and not new_node.is_dir: - changed.add(new_node.rel_path) - return - - if old_node.is_dir != new_node.is_dir: - changed.update(n.rel_path for n in old_node.iter_files()) - changed.update(n.rel_path for n in new_node.iter_files()) - return - - names = set(old_node.children.keys()) | set(new_node.children.keys()) - for name in names: - walk(old_node.children.get(name), new_node.children.get(name)) - - walk(old.root, new.root) - return sorted(changed) - diff --git a/codex-lens/build/lib/codexlens/storage/migration_manager.py b/codex-lens/build/lib/codexlens/storage/migration_manager.py deleted file mode 100644 index d8690806..00000000 --- a/codex-lens/build/lib/codexlens/storage/migration_manager.py +++ /dev/null @@ -1,154 +0,0 @@ -""" -Manages database schema migrations. 
- -This module provides a framework for applying versioned migrations to the SQLite -database. Migrations are discovered from the `codexlens.storage.migrations` -package and applied sequentially. The database schema version is tracked using -the `user_version` pragma. -""" - -import importlib -import logging -import pkgutil -from pathlib import Path -from sqlite3 import Connection -from typing import List, NamedTuple - -log = logging.getLogger(__name__) - - -class Migration(NamedTuple): - """Represents a single database migration.""" - - version: int - name: str - upgrade: callable - - -def discover_migrations() -> List[Migration]: - """ - Discovers and returns a sorted list of database migrations. - - Migrations are expected to be in the `codexlens.storage.migrations` package, - with filenames in the format `migration_XXX_description.py`, where XXX is - the version number. Each migration module must contain an `upgrade` function - that takes a `sqlite3.Connection` object as its argument. - - Returns: - A list of Migration objects, sorted by version. - """ - import codexlens.storage.migrations - - migrations = [] - package_path = Path(codexlens.storage.migrations.__file__).parent - - for _, name, _ in pkgutil.iter_modules([str(package_path)]): - if name.startswith("migration_"): - try: - version = int(name.split("_")[1]) - module = importlib.import_module(f"codexlens.storage.migrations.{name}") - if hasattr(module, "upgrade"): - migrations.append( - Migration(version=version, name=name, upgrade=module.upgrade) - ) - else: - log.warning(f"Migration {name} is missing 'upgrade' function.") - except (ValueError, IndexError) as e: - log.warning(f"Could not parse migration name {name}: {e}") - except ImportError as e: - log.warning(f"Could not import migration {name}: {e}") - - migrations.sort(key=lambda m: m.version) - return migrations - - -class MigrationManager: - """ - Manages the application of migrations to a database. 
- """ - - def __init__(self, db_conn: Connection): - """ - Initializes the MigrationManager. - - Args: - db_conn: The SQLite database connection. - """ - self.db_conn = db_conn - self.migrations = discover_migrations() - - def get_current_version(self) -> int: - """ - Gets the current version of the database schema. - - Returns: - The current schema version number. - """ - return self.db_conn.execute("PRAGMA user_version").fetchone()[0] - - def set_version(self, version: int): - """ - Sets the database schema version. - - Args: - version: The version number to set. - """ - self.db_conn.execute(f"PRAGMA user_version = {version}") - log.info(f"Database schema version set to {version}") - - def apply_migrations(self): - """ - Applies all pending migrations to the database. - - This method checks the current database version and applies all - subsequent migrations in order. Each migration is applied within - a transaction, unless the migration manages its own transactions. - """ - current_version = self.get_current_version() - log.info(f"Current database schema version: {current_version}") - - for migration in self.migrations: - if migration.version > current_version: - log.info(f"Applying migration {migration.version}: {migration.name}...") - try: - # Check if a transaction is already in progress - in_transaction = self.db_conn.in_transaction - - # Only start transaction if not already in one - if not in_transaction: - self.db_conn.execute("BEGIN") - - migration.upgrade(self.db_conn) - self.set_version(migration.version) - - # Only commit if we started the transaction and it's still active - if not in_transaction and self.db_conn.in_transaction: - self.db_conn.execute("COMMIT") - - log.info( - f"Successfully applied migration {migration.version}: {migration.name}" - ) - except Exception as e: - log.error( - f"Failed to apply migration {migration.version}: {migration.name}. 
Error: {e}", - exc_info=True, - ) - # Try to rollback if transaction is active - try: - if self.db_conn.in_transaction: - self.db_conn.execute("ROLLBACK") - except Exception: - pass # Ignore rollback errors - raise - - latest_migration_version = self.migrations[-1].version if self.migrations else 0 - if current_version < latest_migration_version: - # This case can be hit if migrations were applied but the loop was exited - # and set_version was not called for the last one for some reason. - # To be safe, we explicitly set the version to the latest known migration. - final_version = self.get_current_version() - if final_version != latest_migration_version: - log.warning(f"Database version ({final_version}) is not the latest migration version ({latest_migration_version}). This may indicate a problem.") - - log.info("All pending migrations applied successfully.") - diff --git a/codex-lens/build/lib/codexlens/storage/migrations/__init__.py b/codex-lens/build/lib/codexlens/storage/migrations/__init__.py deleted file mode 100644 index 06e14729..00000000 --- a/codex-lens/build/lib/codexlens/storage/migrations/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# This file makes the 'migrations' directory a Python package. diff --git a/codex-lens/build/lib/codexlens/storage/migrations/migration_001_normalize_keywords.py b/codex-lens/build/lib/codexlens/storage/migrations/migration_001_normalize_keywords.py deleted file mode 100644 index 97df06fd..00000000 --- a/codex-lens/build/lib/codexlens/storage/migrations/migration_001_normalize_keywords.py +++ /dev/null @@ -1,123 +0,0 @@ -""" -Migration 001: Normalize keywords into separate tables. - -This migration introduces two new tables, `keywords` and `file_keywords`, to -store semantic keywords in a normalized fashion. It then migrates the existing -keywords from the `semantic_data` JSON blob in the `files` table into these -new tables. This is intended to speed up keyword-based searches significantly. 
-""" - -import json -import logging -from sqlite3 import Connection - -log = logging.getLogger(__name__) - - -def upgrade(db_conn: Connection): - """ - Applies the migration to normalize keywords. - - - Creates `keywords` and `file_keywords` tables. - - Creates indexes for efficient querying. - - Migrates data from `files.semantic_data` to the new tables. - - Args: - db_conn: The SQLite database connection. - """ - cursor = db_conn.cursor() - - log.info("Creating 'keywords' and 'file_keywords' tables...") - # Create a table to store unique keywords - cursor.execute( - """ - CREATE TABLE IF NOT EXISTS keywords ( - id INTEGER PRIMARY KEY, - keyword TEXT NOT NULL UNIQUE - ) - """ - ) - - # Create a join table to link files and keywords (many-to-many) - cursor.execute( - """ - CREATE TABLE IF NOT EXISTS file_keywords ( - file_id INTEGER NOT NULL, - keyword_id INTEGER NOT NULL, - PRIMARY KEY (file_id, keyword_id), - FOREIGN KEY (file_id) REFERENCES files (id) ON DELETE CASCADE, - FOREIGN KEY (keyword_id) REFERENCES keywords (id) ON DELETE CASCADE - ) - """ - ) - - log.info("Creating indexes for new keyword tables...") - cursor.execute("CREATE INDEX IF NOT EXISTS idx_keywords_keyword ON keywords (keyword)") - cursor.execute("CREATE INDEX IF NOT EXISTS idx_file_keywords_file_id ON file_keywords (file_id)") - cursor.execute("CREATE INDEX IF NOT EXISTS idx_file_keywords_keyword_id ON file_keywords (keyword_id)") - - log.info("Migrating existing keywords from 'semantic_metadata' table...") - - # Check if semantic_metadata table exists before querying - cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='semantic_metadata'") - if not cursor.fetchone(): - log.info("No 'semantic_metadata' table found, skipping data migration.") - return - - # Check if 'keywords' column exists in semantic_metadata table - # (current schema may already use normalized tables without this column) - cursor.execute("PRAGMA table_info(semantic_metadata)") - columns = {row[1] 
for row in cursor.fetchall()} - if "keywords" not in columns: - log.info("No 'keywords' column in semantic_metadata table, skipping data migration.") - return - - cursor.execute("SELECT file_id, keywords FROM semantic_metadata WHERE keywords IS NOT NULL AND keywords != ''") - - files_to_migrate = cursor.fetchall() - if not files_to_migrate: - log.info("No existing files with semantic metadata to migrate.") - return - - log.info(f"Found {len(files_to_migrate)} files with semantic metadata to migrate.") - - for file_id, keywords_json in files_to_migrate: - if not keywords_json: - continue - try: - keywords = json.loads(keywords_json) - - if not isinstance(keywords, list): - log.warning(f"Keywords for file_id {file_id} is not a list, skipping.") - continue - - for keyword in keywords: - if not isinstance(keyword, str): - log.warning(f"Non-string keyword '{keyword}' found for file_id {file_id}, skipping.") - continue - - keyword = keyword.strip() - if not keyword: - continue - - # Get or create keyword_id - cursor.execute("INSERT OR IGNORE INTO keywords (keyword) VALUES (?)", (keyword,)) - cursor.execute("SELECT id FROM keywords WHERE keyword = ?", (keyword,)) - keyword_id_result = cursor.fetchone() - - if keyword_id_result: - keyword_id = keyword_id_result[0] - # Link file to keyword - cursor.execute( - "INSERT OR IGNORE INTO file_keywords (file_id, keyword_id) VALUES (?, ?)", - (file_id, keyword_id), - ) - else: - log.error(f"Failed to retrieve or create keyword_id for keyword: {keyword}") - - except json.JSONDecodeError as e: - log.warning(f"Could not parse keywords for file_id {file_id}: {e}") - except Exception as e: - log.error(f"An unexpected error occurred during migration for file_id {file_id}: {e}", exc_info=True) - - log.info("Finished migrating keywords.") diff --git a/codex-lens/build/lib/codexlens/storage/migrations/migration_002_add_token_metadata.py b/codex-lens/build/lib/codexlens/storage/migrations/migration_002_add_token_metadata.py deleted file mode 
100644 index daa3085e..00000000 --- a/codex-lens/build/lib/codexlens/storage/migrations/migration_002_add_token_metadata.py +++ /dev/null @@ -1,48 +0,0 @@ -""" -Migration 002: Add token_count and symbol_type to symbols table. - -This migration adds token counting metadata to symbols for accurate chunk -splitting and performance optimization. It also adds symbol_type for better -filtering in searches. -""" - -import logging -from sqlite3 import Connection - -log = logging.getLogger(__name__) - - -def upgrade(db_conn: Connection): - """ - Applies the migration to add token metadata to symbols. - - - Adds token_count column to symbols table - - Adds symbol_type column to symbols table (for future use) - - Creates index on symbol_type for efficient filtering - - Backfills existing symbols with NULL token_count (to be calculated lazily) - - Args: - db_conn: The SQLite database connection. - """ - cursor = db_conn.cursor() - - log.info("Adding token_count column to symbols table...") - try: - cursor.execute("ALTER TABLE symbols ADD COLUMN token_count INTEGER") - log.info("Successfully added token_count column.") - except Exception as e: - # Column might already exist - log.warning(f"Could not add token_count column (might already exist): {e}") - - log.info("Adding symbol_type column to symbols table...") - try: - cursor.execute("ALTER TABLE symbols ADD COLUMN symbol_type TEXT") - log.info("Successfully added symbol_type column.") - except Exception as e: - # Column might already exist - log.warning(f"Could not add symbol_type column (might already exist): {e}") - - log.info("Creating index on symbol_type for efficient filtering...") - cursor.execute("CREATE INDEX IF NOT EXISTS idx_symbols_type ON symbols(symbol_type)") - - log.info("Migration 002 completed successfully.") diff --git a/codex-lens/build/lib/codexlens/storage/migrations/migration_004_dual_fts.py b/codex-lens/build/lib/codexlens/storage/migrations/migration_004_dual_fts.py deleted file mode 100644 index 
502e067d..00000000 --- a/codex-lens/build/lib/codexlens/storage/migrations/migration_004_dual_fts.py +++ /dev/null @@ -1,232 +0,0 @@ -""" -Migration 004: Add dual FTS tables for exact and fuzzy matching. - -This migration introduces two FTS5 tables: -- files_fts_exact: Uses unicode61 tokenizer for exact token matching -- files_fts_fuzzy: Uses trigram tokenizer (or extended unicode61) for substring/fuzzy matching - -Both tables are synchronized with the files table via triggers for automatic updates. -""" - -import logging -from sqlite3 import Connection - -from codexlens.storage.sqlite_utils import check_trigram_support, get_sqlite_version - -log = logging.getLogger(__name__) - - -def upgrade(db_conn: Connection): - """ - Applies the migration to add dual FTS tables. - - - Drops old files_fts table and triggers - - Creates files_fts_exact with unicode61 tokenizer - - Creates files_fts_fuzzy with trigram or extended unicode61 tokenizer - - Creates synchronized triggers for both tables - - Rebuilds FTS indexes from files table - - Args: - db_conn: The SQLite database connection. 
- """ - cursor = db_conn.cursor() - - try: - # Check trigram support - has_trigram = check_trigram_support(db_conn) - version = get_sqlite_version(db_conn) - log.info(f"SQLite version: {'.'.join(map(str, version))}") - - if has_trigram: - log.info("Trigram tokenizer available, using for fuzzy FTS table") - fuzzy_tokenizer = "trigram" - else: - log.warning( - f"Trigram tokenizer not available (requires SQLite >= 3.34), " - f"using extended unicode61 tokenizer for fuzzy matching" - ) - fuzzy_tokenizer = "unicode61 tokenchars '_-.'" - - # Start transaction - cursor.execute("BEGIN TRANSACTION") - - # Check if files table has 'name' column (v2 schema doesn't have it) - cursor.execute("PRAGMA table_info(files)") - columns = {row[1] for row in cursor.fetchall()} - - if 'name' not in columns: - log.info("Adding 'name' column to files table (v2 schema upgrade)...") - # Add name column - cursor.execute("ALTER TABLE files ADD COLUMN name TEXT") - # Populate name from path (extract filename from last '/') - # Use Python to do the extraction since SQLite doesn't have reverse() - cursor.execute("SELECT rowid, path FROM files") - rows = cursor.fetchall() - for rowid, path in rows: - # Extract filename from path - name = path.split('/')[-1] if '/' in path else path - cursor.execute("UPDATE files SET name = ? 
WHERE rowid = ?", (name, rowid)) - - # Rename 'path' column to 'full_path' if needed - if 'path' in columns and 'full_path' not in columns: - log.info("Renaming 'path' to 'full_path' (v2 schema upgrade)...") - # Check if indexed_at column exists in v2 schema - has_indexed_at = 'indexed_at' in columns - has_mtime = 'mtime' in columns - - # SQLite doesn't support RENAME COLUMN before 3.25, so use table recreation - cursor.execute(""" - CREATE TABLE files_new ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - name TEXT NOT NULL, - full_path TEXT NOT NULL UNIQUE, - content TEXT, - language TEXT, - mtime REAL, - indexed_at TEXT - ) - """) - - # Build INSERT statement based on available columns - # Note: v2 schema has no rowid (path is PRIMARY KEY), so use NULL for AUTOINCREMENT - if has_indexed_at and has_mtime: - cursor.execute(""" - INSERT INTO files_new (name, full_path, content, language, mtime, indexed_at) - SELECT name, path, content, language, mtime, indexed_at FROM files - """) - elif has_indexed_at: - cursor.execute(""" - INSERT INTO files_new (name, full_path, content, language, indexed_at) - SELECT name, path, content, language, indexed_at FROM files - """) - elif has_mtime: - cursor.execute(""" - INSERT INTO files_new (name, full_path, content, language, mtime) - SELECT name, path, content, language, mtime FROM files - """) - else: - cursor.execute(""" - INSERT INTO files_new (name, full_path, content, language) - SELECT name, path, content, language FROM files - """) - - cursor.execute("DROP TABLE files") - cursor.execute("ALTER TABLE files_new RENAME TO files") - - log.info("Dropping old FTS triggers and table...") - # Drop old triggers - cursor.execute("DROP TRIGGER IF EXISTS files_ai") - cursor.execute("DROP TRIGGER IF EXISTS files_ad") - cursor.execute("DROP TRIGGER IF EXISTS files_au") - - # Drop old FTS table - cursor.execute("DROP TABLE IF EXISTS files_fts") - - # Create exact FTS table (unicode61 with underscores/hyphens/dots as token chars) - # Note: 
tokenchars includes '.' to properly tokenize qualified names like PortRole.FLOW - log.info("Creating files_fts_exact table with unicode61 tokenizer...") - cursor.execute( - """ - CREATE VIRTUAL TABLE files_fts_exact USING fts5( - name, full_path UNINDEXED, content, - content='files', - content_rowid='id', - tokenize="unicode61 tokenchars '_-.'" - ) - """ - ) - - # Create fuzzy FTS table (trigram or extended unicode61) - log.info(f"Creating files_fts_fuzzy table with {fuzzy_tokenizer} tokenizer...") - cursor.execute( - f""" - CREATE VIRTUAL TABLE files_fts_fuzzy USING fts5( - name, full_path UNINDEXED, content, - content='files', - content_rowid='id', - tokenize="{fuzzy_tokenizer}" - ) - """ - ) - - # Create synchronized triggers for files_fts_exact - log.info("Creating triggers for files_fts_exact...") - cursor.execute( - """ - CREATE TRIGGER files_exact_ai AFTER INSERT ON files BEGIN - INSERT INTO files_fts_exact(rowid, name, full_path, content) - VALUES(new.id, new.name, new.full_path, new.content); - END - """ - ) - cursor.execute( - """ - CREATE TRIGGER files_exact_ad AFTER DELETE ON files BEGIN - INSERT INTO files_fts_exact(files_fts_exact, rowid, name, full_path, content) - VALUES('delete', old.id, old.name, old.full_path, old.content); - END - """ - ) - cursor.execute( - """ - CREATE TRIGGER files_exact_au AFTER UPDATE ON files BEGIN - INSERT INTO files_fts_exact(files_fts_exact, rowid, name, full_path, content) - VALUES('delete', old.id, old.name, old.full_path, old.content); - INSERT INTO files_fts_exact(rowid, name, full_path, content) - VALUES(new.id, new.name, new.full_path, new.content); - END - """ - ) - - # Create synchronized triggers for files_fts_fuzzy - log.info("Creating triggers for files_fts_fuzzy...") - cursor.execute( - """ - CREATE TRIGGER files_fuzzy_ai AFTER INSERT ON files BEGIN - INSERT INTO files_fts_fuzzy(rowid, name, full_path, content) - VALUES(new.id, new.name, new.full_path, new.content); - END - """ - ) - cursor.execute( - """ - 
CREATE TRIGGER files_fuzzy_ad AFTER DELETE ON files BEGIN - INSERT INTO files_fts_fuzzy(files_fts_fuzzy, rowid, name, full_path, content) - VALUES('delete', old.id, old.name, old.full_path, old.content); - END - """ - ) - cursor.execute( - """ - CREATE TRIGGER files_fuzzy_au AFTER UPDATE ON files BEGIN - INSERT INTO files_fts_fuzzy(files_fts_fuzzy, rowid, name, full_path, content) - VALUES('delete', old.id, old.name, old.full_path, old.content); - INSERT INTO files_fts_fuzzy(rowid, name, full_path, content) - VALUES(new.id, new.name, new.full_path, new.content); - END - """ - ) - - # Rebuild FTS indexes from files table - log.info("Rebuilding FTS indexes from files table...") - cursor.execute("INSERT INTO files_fts_exact(files_fts_exact) VALUES('rebuild')") - cursor.execute("INSERT INTO files_fts_fuzzy(files_fts_fuzzy) VALUES('rebuild')") - - # Commit transaction - cursor.execute("COMMIT") - log.info("Migration 004 completed successfully") - - # Vacuum to reclaim space (outside transaction) - try: - log.info("Running VACUUM to reclaim space...") - cursor.execute("VACUUM") - except Exception as e: - log.warning(f"VACUUM failed (non-critical): {e}") - - except Exception as e: - log.error(f"Migration 004 failed: {e}") - try: - cursor.execute("ROLLBACK") - except Exception: - pass - raise diff --git a/codex-lens/build/lib/codexlens/storage/migrations/migration_005_cleanup_unused_fields.py b/codex-lens/build/lib/codexlens/storage/migrations/migration_005_cleanup_unused_fields.py deleted file mode 100644 index 918bf17a..00000000 --- a/codex-lens/build/lib/codexlens/storage/migrations/migration_005_cleanup_unused_fields.py +++ /dev/null @@ -1,196 +0,0 @@ -""" -Migration 005: Remove unused and redundant database fields. - -This migration removes four problematic fields identified by Gemini analysis: - -1. 
**semantic_metadata.keywords** (deprecated - replaced by file_keywords table) - - Data: Migrated to normalized file_keywords table in migration 001 - - Impact: Column now redundant, remove to prevent sync issues - -2. **symbols.token_count** (unused - always NULL) - - Data: Never populated, always NULL - - Impact: No data loss, just removes unused column - -3. **symbols.symbol_type** (redundant - duplicates kind) - - Data: Redundant with symbols.kind field - - Impact: No data loss, kind field contains same information - -4. **subdirs.direct_files** (unused - never displayed) - - Data: Never used in queries or display logic - - Impact: No data loss, just removes unused column - -Schema changes use table recreation pattern (SQLite best practice): -- Create new table without deprecated columns -- Copy data from old table -- Drop old table -- Rename new table -- Recreate indexes -""" - -import logging -from sqlite3 import Connection - -log = logging.getLogger(__name__) - - -def upgrade(db_conn: Connection): - """Remove unused and redundant fields from schema. - - Note: Transaction management is handled by MigrationManager. - This migration should NOT start its own transaction. - - Args: - db_conn: The SQLite database connection. 
- """ - cursor = db_conn.cursor() - - # Step 1: Remove semantic_metadata.keywords (if column exists) - log.info("Checking semantic_metadata.keywords column...") - - cursor.execute( - "SELECT name FROM sqlite_master WHERE type='table' AND name='semantic_metadata'" - ) - if cursor.fetchone(): - # Check if keywords column exists - cursor.execute("PRAGMA table_info(semantic_metadata)") - columns = {row[1] for row in cursor.fetchall()} - - if "keywords" in columns: - log.info("Removing semantic_metadata.keywords column...") - cursor.execute(""" - CREATE TABLE semantic_metadata_new ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - file_id INTEGER NOT NULL UNIQUE, - summary TEXT, - purpose TEXT, - llm_tool TEXT, - generated_at REAL, - FOREIGN KEY (file_id) REFERENCES files(id) ON DELETE CASCADE - ) - """) - - cursor.execute(""" - INSERT INTO semantic_metadata_new (id, file_id, summary, purpose, llm_tool, generated_at) - SELECT id, file_id, summary, purpose, llm_tool, generated_at - FROM semantic_metadata - """) - - cursor.execute("DROP TABLE semantic_metadata") - cursor.execute("ALTER TABLE semantic_metadata_new RENAME TO semantic_metadata") - - # Recreate index - cursor.execute( - "CREATE INDEX IF NOT EXISTS idx_semantic_file ON semantic_metadata(file_id)" - ) - log.info("Removed semantic_metadata.keywords column") - else: - log.info("semantic_metadata.keywords column does not exist, skipping") - else: - log.info("semantic_metadata table does not exist, skipping") - - # Step 2: Remove symbols.token_count and symbols.symbol_type (if columns exist) - log.info("Checking symbols.token_count and symbols.symbol_type columns...") - - cursor.execute( - "SELECT name FROM sqlite_master WHERE type='table' AND name='symbols'" - ) - if cursor.fetchone(): - # Check if token_count or symbol_type columns exist - cursor.execute("PRAGMA table_info(symbols)") - columns = {row[1] for row in cursor.fetchall()} - - if "token_count" in columns or "symbol_type" in columns: - log.info("Removing 
symbols.token_count and symbols.symbol_type columns...") - cursor.execute(""" - CREATE TABLE symbols_new ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - file_id INTEGER NOT NULL, - name TEXT NOT NULL, - kind TEXT, - start_line INTEGER, - end_line INTEGER, - FOREIGN KEY (file_id) REFERENCES files(id) ON DELETE CASCADE - ) - """) - - cursor.execute(""" - INSERT INTO symbols_new (id, file_id, name, kind, start_line, end_line) - SELECT id, file_id, name, kind, start_line, end_line - FROM symbols - """) - - cursor.execute("DROP TABLE symbols") - cursor.execute("ALTER TABLE symbols_new RENAME TO symbols") - - # Recreate indexes - cursor.execute("CREATE INDEX IF NOT EXISTS idx_symbols_file ON symbols(file_id)") - cursor.execute("CREATE INDEX IF NOT EXISTS idx_symbols_name ON symbols(name)") - log.info("Removed symbols.token_count and symbols.symbol_type columns") - else: - log.info("symbols.token_count/symbol_type columns do not exist, skipping") - else: - log.info("symbols table does not exist, skipping") - - # Step 3: Remove subdirs.direct_files (if column exists) - log.info("Checking subdirs.direct_files column...") - - cursor.execute( - "SELECT name FROM sqlite_master WHERE type='table' AND name='subdirs'" - ) - if cursor.fetchone(): - # Check if direct_files column exists - cursor.execute("PRAGMA table_info(subdirs)") - columns = {row[1] for row in cursor.fetchall()} - - if "direct_files" in columns: - log.info("Removing subdirs.direct_files column...") - cursor.execute(""" - CREATE TABLE subdirs_new ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - name TEXT NOT NULL UNIQUE, - index_path TEXT NOT NULL, - files_count INTEGER DEFAULT 0, - last_updated REAL - ) - """) - - cursor.execute(""" - INSERT INTO subdirs_new (id, name, index_path, files_count, last_updated) - SELECT id, name, index_path, files_count, last_updated - FROM subdirs - """) - - cursor.execute("DROP TABLE subdirs") - cursor.execute("ALTER TABLE subdirs_new RENAME TO subdirs") - - # Recreate index - 
cursor.execute("CREATE INDEX IF NOT EXISTS idx_subdirs_name ON subdirs(name)") - log.info("Removed subdirs.direct_files column") - else: - log.info("subdirs.direct_files column does not exist, skipping") - else: - log.info("subdirs table does not exist, skipping") - - log.info("Migration 005 completed successfully") - - # Vacuum to reclaim space (outside transaction, optional) - # Note: VACUUM cannot run inside a transaction, so we skip it here - # The caller can run VACUUM separately if desired - - -def downgrade(db_conn: Connection): - """Restore removed fields (data will be lost for keywords, token_count, symbol_type, direct_files). - - This is a placeholder - true downgrade is not feasible as data is lost. - The migration is designed to be one-way since removed fields are unused/redundant. - - Args: - db_conn: The SQLite database connection. - """ - log.warning( - "Migration 005 downgrade not supported - removed fields are unused/redundant. " - "Data cannot be restored." - ) - raise NotImplementedError( - "Migration 005 downgrade not supported - this is a one-way migration" - ) diff --git a/codex-lens/build/lib/codexlens/storage/migrations/migration_006_enhance_relationships.py b/codex-lens/build/lib/codexlens/storage/migrations/migration_006_enhance_relationships.py deleted file mode 100644 index 2c7c6cd8..00000000 --- a/codex-lens/build/lib/codexlens/storage/migrations/migration_006_enhance_relationships.py +++ /dev/null @@ -1,37 +0,0 @@ -""" -Migration 006: Ensure relationship tables and indexes exist. - -This migration is intentionally idempotent. It creates the `code_relationships` -table (used for graph visualization) and its indexes if missing. 
-""" - -from __future__ import annotations - -import logging -from sqlite3 import Connection - -log = logging.getLogger(__name__) - - -def upgrade(db_conn: Connection) -> None: - cursor = db_conn.cursor() - - log.info("Ensuring code_relationships table exists...") - cursor.execute( - """ - CREATE TABLE IF NOT EXISTS code_relationships ( - id INTEGER PRIMARY KEY, - source_symbol_id INTEGER NOT NULL REFERENCES symbols (id) ON DELETE CASCADE, - target_qualified_name TEXT NOT NULL, - relationship_type TEXT NOT NULL, - source_line INTEGER NOT NULL, - target_file TEXT - ) - """ - ) - - log.info("Ensuring relationship indexes exist...") - cursor.execute("CREATE INDEX IF NOT EXISTS idx_rel_source ON code_relationships(source_symbol_id)") - cursor.execute("CREATE INDEX IF NOT EXISTS idx_rel_target ON code_relationships(target_qualified_name)") - cursor.execute("CREATE INDEX IF NOT EXISTS idx_rel_type ON code_relationships(relationship_type)") - diff --git a/codex-lens/build/lib/codexlens/storage/migrations/migration_007_add_graph_neighbors.py b/codex-lens/build/lib/codexlens/storage/migrations/migration_007_add_graph_neighbors.py deleted file mode 100644 index 83306886..00000000 --- a/codex-lens/build/lib/codexlens/storage/migrations/migration_007_add_graph_neighbors.py +++ /dev/null @@ -1,47 +0,0 @@ -""" -Migration 007: Add precomputed graph neighbor table for search expansion. - -Adds: -- graph_neighbors: cached N-hop neighbors between symbols (keyed by symbol ids) - -This table is derived data (a cache) and is safe to rebuild at any time. -The migration is intentionally idempotent. 
-""" - -from __future__ import annotations - -import logging -from sqlite3 import Connection - -log = logging.getLogger(__name__) - - -def upgrade(db_conn: Connection) -> None: - cursor = db_conn.cursor() - - log.info("Creating graph_neighbors table...") - cursor.execute( - """ - CREATE TABLE IF NOT EXISTS graph_neighbors ( - source_symbol_id INTEGER NOT NULL REFERENCES symbols(id) ON DELETE CASCADE, - neighbor_symbol_id INTEGER NOT NULL REFERENCES symbols(id) ON DELETE CASCADE, - relationship_depth INTEGER NOT NULL, - PRIMARY KEY (source_symbol_id, neighbor_symbol_id) - ) - """ - ) - - log.info("Creating indexes for graph_neighbors...") - cursor.execute( - """ - CREATE INDEX IF NOT EXISTS idx_graph_neighbors_source_depth - ON graph_neighbors(source_symbol_id, relationship_depth) - """ - ) - cursor.execute( - """ - CREATE INDEX IF NOT EXISTS idx_graph_neighbors_neighbor - ON graph_neighbors(neighbor_symbol_id) - """ - ) - diff --git a/codex-lens/build/lib/codexlens/storage/migrations/migration_008_add_merkle_hashes.py b/codex-lens/build/lib/codexlens/storage/migrations/migration_008_add_merkle_hashes.py deleted file mode 100644 index 092fc20a..00000000 --- a/codex-lens/build/lib/codexlens/storage/migrations/migration_008_add_merkle_hashes.py +++ /dev/null @@ -1,81 +0,0 @@ -""" -Migration 008: Add Merkle hash tables for content-based incremental indexing. - -Adds: -- merkle_hashes: per-file SHA-256 hashes (keyed by file_id) -- merkle_state: directory-level root hash (single row, id=1) - -Backfills merkle_hashes using the existing `files.content` column when available. 
-""" - -from __future__ import annotations - -import hashlib -import logging -import time -from sqlite3 import Connection - -log = logging.getLogger(__name__) - - -def upgrade(db_conn: Connection) -> None: - cursor = db_conn.cursor() - - log.info("Creating merkle_hashes table...") - cursor.execute( - """ - CREATE TABLE IF NOT EXISTS merkle_hashes ( - file_id INTEGER PRIMARY KEY REFERENCES files(id) ON DELETE CASCADE, - sha256 TEXT NOT NULL, - updated_at REAL - ) - """ - ) - - log.info("Creating merkle_state table...") - cursor.execute( - """ - CREATE TABLE IF NOT EXISTS merkle_state ( - id INTEGER PRIMARY KEY CHECK (id = 1), - root_hash TEXT, - updated_at REAL - ) - """ - ) - - # Backfill file hashes from stored content (best-effort). - try: - rows = cursor.execute("SELECT id, content FROM files").fetchall() - except Exception as exc: - log.warning("Unable to backfill merkle hashes (files table missing?): %s", exc) - return - - now = time.time() - inserts: list[tuple[int, str, float]] = [] - - for row in rows: - file_id = int(row[0]) - content = row[1] - if content is None: - continue - try: - digest = hashlib.sha256(str(content).encode("utf-8", errors="ignore")).hexdigest() - inserts.append((file_id, digest, now)) - except Exception: - continue - - if not inserts: - return - - log.info("Backfilling %d file hashes...", len(inserts)) - cursor.executemany( - """ - INSERT INTO merkle_hashes(file_id, sha256, updated_at) - VALUES(?, ?, ?) - ON CONFLICT(file_id) DO UPDATE SET - sha256=excluded.sha256, - updated_at=excluded.updated_at - """, - inserts, - ) - diff --git a/codex-lens/build/lib/codexlens/storage/migrations/migration_009_add_splade.py b/codex-lens/build/lib/codexlens/storage/migrations/migration_009_add_splade.py deleted file mode 100644 index c675233e..00000000 --- a/codex-lens/build/lib/codexlens/storage/migrations/migration_009_add_splade.py +++ /dev/null @@ -1,103 +0,0 @@ -""" -Migration 009: Add SPLADE sparse retrieval tables. 
- -This migration introduces SPLADE (Sparse Lexical AnD Expansion) support: -- splade_metadata: Model configuration (model name, vocab size, ONNX path) -- splade_posting_list: Inverted index mapping token_id -> (chunk_id, weight) - -The SPLADE tables are designed for efficient sparse vector retrieval: -- Token-based lookup for query expansion -- Chunk-based deletion for index maintenance -- Maintains backward compatibility with existing FTS tables -""" - -import logging -from sqlite3 import Connection - -log = logging.getLogger(__name__) - - -def upgrade(db_conn: Connection) -> None: - """ - Adds SPLADE tables for sparse retrieval. - - Creates: - - splade_metadata: Stores model configuration and ONNX path - - splade_posting_list: Inverted index with token_id -> (chunk_id, weight) mappings - - Indexes for efficient token-based and chunk-based lookups - - Args: - db_conn: The SQLite database connection. - """ - cursor = db_conn.cursor() - - log.info("Creating splade_metadata table...") - cursor.execute( - """ - CREATE TABLE IF NOT EXISTS splade_metadata ( - id INTEGER PRIMARY KEY DEFAULT 1, - model_name TEXT NOT NULL, - vocab_size INTEGER NOT NULL, - onnx_path TEXT, - created_at REAL - ) - """ - ) - - log.info("Creating splade_posting_list table...") - cursor.execute( - """ - CREATE TABLE IF NOT EXISTS splade_posting_list ( - token_id INTEGER NOT NULL, - chunk_id INTEGER NOT NULL, - weight REAL NOT NULL, - PRIMARY KEY (token_id, chunk_id), - FOREIGN KEY (chunk_id) REFERENCES semantic_chunks(id) ON DELETE CASCADE - ) - """ - ) - - log.info("Creating indexes for splade_posting_list...") - # Index for efficient chunk-based lookups (deletion, updates) - cursor.execute( - """ - CREATE INDEX IF NOT EXISTS idx_splade_by_chunk - ON splade_posting_list(chunk_id) - """ - ) - - # Index for efficient term-based retrieval - cursor.execute( - """ - CREATE INDEX IF NOT EXISTS idx_splade_by_token - ON splade_posting_list(token_id) - """ - ) - - log.info("Migration 009 completed 
successfully") - - -def downgrade(db_conn: Connection) -> None: - """ - Removes SPLADE tables. - - Drops: - - splade_posting_list (and associated indexes) - - splade_metadata - - Args: - db_conn: The SQLite database connection. - """ - cursor = db_conn.cursor() - - log.info("Dropping SPLADE indexes...") - cursor.execute("DROP INDEX IF EXISTS idx_splade_by_chunk") - cursor.execute("DROP INDEX IF EXISTS idx_splade_by_token") - - log.info("Dropping splade_posting_list table...") - cursor.execute("DROP TABLE IF EXISTS splade_posting_list") - - log.info("Dropping splade_metadata table...") - cursor.execute("DROP TABLE IF EXISTS splade_metadata") - - log.info("Migration 009 downgrade completed successfully") diff --git a/codex-lens/build/lib/codexlens/storage/migrations/migration_010_add_multi_vector_chunks.py b/codex-lens/build/lib/codexlens/storage/migrations/migration_010_add_multi_vector_chunks.py deleted file mode 100644 index 9a937200..00000000 --- a/codex-lens/build/lib/codexlens/storage/migrations/migration_010_add_multi_vector_chunks.py +++ /dev/null @@ -1,162 +0,0 @@ -""" -Migration 010: Add multi-vector storage support for cascade retrieval. - -This migration introduces the chunks table with multi-vector support: -- chunks: Stores code chunks with multiple embedding types - - embedding: Original embedding for backward compatibility - - embedding_binary: 256-dim binary vector for coarse ranking (fast) - - embedding_dense: 2048-dim dense vector for fine ranking (precise) - -The multi-vector architecture enables cascade retrieval: -1. First stage: Fast binary vector search for candidate retrieval -2. Second stage: Dense vector reranking for precision -""" - -import logging -from sqlite3 import Connection - -log = logging.getLogger(__name__) - - -def upgrade(db_conn: Connection) -> None: - """ - Adds chunks table with multi-vector embedding columns. 
- - Creates: - - chunks: Table for storing code chunks with multiple embedding types - - idx_chunks_file_path: Index for efficient file-based lookups - - Also migrates existing chunks tables by adding new columns if needed. - - Args: - db_conn: The SQLite database connection. - """ - cursor = db_conn.cursor() - - # Check if chunks table already exists - table_exists = cursor.execute( - "SELECT name FROM sqlite_master WHERE type='table' AND name='chunks'" - ).fetchone() - - if table_exists: - # Migrate existing table - add new columns if missing - log.info("chunks table exists, checking for missing columns...") - - col_info = cursor.execute("PRAGMA table_info(chunks)").fetchall() - existing_columns = {row[1] for row in col_info} - - if "embedding_binary" not in existing_columns: - log.info("Adding embedding_binary column to chunks table...") - cursor.execute( - "ALTER TABLE chunks ADD COLUMN embedding_binary BLOB" - ) - - if "embedding_dense" not in existing_columns: - log.info("Adding embedding_dense column to chunks table...") - cursor.execute( - "ALTER TABLE chunks ADD COLUMN embedding_dense BLOB" - ) - else: - # Create new table with all columns - log.info("Creating chunks table with multi-vector support...") - cursor.execute( - """ - CREATE TABLE chunks ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - file_path TEXT NOT NULL, - content TEXT NOT NULL, - embedding BLOB, - embedding_binary BLOB, - embedding_dense BLOB, - metadata TEXT, - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP - ) - """ - ) - - # Create index for file-based lookups - log.info("Creating index for chunks table...") - cursor.execute( - """ - CREATE INDEX IF NOT EXISTS idx_chunks_file_path - ON chunks(file_path) - """ - ) - - log.info("Migration 010 completed successfully") - - -def downgrade(db_conn: Connection) -> None: - """ - Removes multi-vector columns from chunks table. - - Note: This does not drop the chunks table entirely to preserve data. 
- Only the new columns added by this migration are removed. - - Args: - db_conn: The SQLite database connection. - """ - cursor = db_conn.cursor() - - log.info("Removing multi-vector columns from chunks table...") - - # SQLite doesn't support DROP COLUMN directly in older versions - # We need to recreate the table without the columns - - # Check if chunks table exists - table_exists = cursor.execute( - "SELECT name FROM sqlite_master WHERE type='table' AND name='chunks'" - ).fetchone() - - if not table_exists: - log.info("chunks table does not exist, nothing to downgrade") - return - - # Check if the columns exist before trying to remove them - col_info = cursor.execute("PRAGMA table_info(chunks)").fetchall() - existing_columns = {row[1] for row in col_info} - - needs_migration = ( - "embedding_binary" in existing_columns or - "embedding_dense" in existing_columns - ) - - if not needs_migration: - log.info("Multi-vector columns not present, nothing to remove") - return - - # Recreate table without the new columns - log.info("Recreating chunks table without multi-vector columns...") - - cursor.execute( - """ - CREATE TABLE chunks_backup ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - file_path TEXT NOT NULL, - content TEXT NOT NULL, - embedding BLOB, - metadata TEXT, - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP - ) - """ - ) - - cursor.execute( - """ - INSERT INTO chunks_backup (id, file_path, content, embedding, metadata, created_at) - SELECT id, file_path, content, embedding, metadata, created_at FROM chunks - """ - ) - - cursor.execute("DROP TABLE chunks") - cursor.execute("ALTER TABLE chunks_backup RENAME TO chunks") - - # Recreate index - cursor.execute( - """ - CREATE INDEX IF NOT EXISTS idx_chunks_file_path - ON chunks(file_path) - """ - ) - - log.info("Migration 010 downgrade completed successfully") diff --git a/codex-lens/build/lib/codexlens/storage/path_mapper.py b/codex-lens/build/lib/codexlens/storage/path_mapper.py deleted file mode 100644 index 
a7cbd701..00000000 --- a/codex-lens/build/lib/codexlens/storage/path_mapper.py +++ /dev/null @@ -1,300 +0,0 @@ -"""Path mapping utilities for source paths and index paths. - -This module provides bidirectional mapping between source code directories -and their corresponding index storage locations. - -Storage Structure: - ~/.codexlens/ - ├── registry.db # Global mapping table - └── indexes/ - └── D/ - └── Claude_dms3/ - ├── _index.db # Root directory index - └── src/ - └── _index.db # src/ directory index -""" - -import json -import os -import platform -from pathlib import Path -from typing import Optional - - -def _get_configured_index_root() -> Path: - """Get the index root from environment or config file. - - Priority order: - 1. CODEXLENS_INDEX_DIR environment variable - 2. index_dir from ~/.codexlens/config.json - 3. Default: ~/.codexlens/indexes - """ - env_override = os.getenv("CODEXLENS_INDEX_DIR") - if env_override: - return Path(env_override).expanduser().resolve() - - config_file = Path.home() / ".codexlens" / "config.json" - if config_file.exists(): - try: - cfg = json.loads(config_file.read_text(encoding="utf-8")) - if "index_dir" in cfg: - return Path(cfg["index_dir"]).expanduser().resolve() - except (json.JSONDecodeError, OSError): - pass - - return Path.home() / ".codexlens" / "indexes" - - -class PathMapper: - """Bidirectional mapping tool for source paths ↔ index paths. - - Handles cross-platform path normalization and conversion between - source code directories and their index storage locations. - - Attributes: - DEFAULT_INDEX_ROOT: Default root directory for all indexes - INDEX_DB_NAME: Standard name for index database files - index_root: Configured index root directory - """ - - DEFAULT_INDEX_ROOT = _get_configured_index_root() - INDEX_DB_NAME = "_index.db" - - def __init__(self, index_root: Optional[Path] = None): - """Initialize PathMapper with optional custom index root. - - Args: - index_root: Custom index root directory. 
If None, uses DEFAULT_INDEX_ROOT. - """ - self.index_root = (index_root or self.DEFAULT_INDEX_ROOT).resolve() - - def source_to_index_dir(self, source_path: Path) -> Path: - """Convert source directory to its index directory path. - - Maps a source code directory to where its index data should be stored. - The mapping preserves the directory structure but normalizes paths - for cross-platform compatibility. - - Args: - source_path: Source directory path to map - - Returns: - Index directory path under index_root - - Examples: - >>> mapper = PathMapper() - >>> mapper.source_to_index_dir(Path("D:/Claude_dms3/src")) - PosixPath('/home/user/.codexlens/indexes/D/Claude_dms3/src') - - >>> mapper.source_to_index_dir(Path("/home/user/project")) - PosixPath('/home/user/.codexlens/indexes/home/user/project') - """ - source_path = source_path.resolve() - normalized = self.normalize_path(source_path) - return self.index_root / normalized - - def source_to_index_db(self, source_path: Path) -> Path: - """Convert source directory to its index database file path. - - Maps a source directory to the full path of its index database file, - including the standard INDEX_DB_NAME. - - Args: - source_path: Source directory path to map - - Returns: - Full path to the index database file - - Examples: - >>> mapper = PathMapper() - >>> mapper.source_to_index_db(Path("D:/Claude_dms3/src")) - PosixPath('/home/user/.codexlens/indexes/D/Claude_dms3/src/_index.db') - """ - index_dir = self.source_to_index_dir(source_path) - return index_dir / self.INDEX_DB_NAME - - def index_to_source(self, index_path: Path) -> Path: - """Convert index path back to original source path. - - Performs reverse mapping from an index storage location to the - original source directory. Handles both directory paths and - database file paths. 
- - Args: - index_path: Index directory or database file path - - Returns: - Original source directory path - - Raises: - ValueError: If index_path is not under index_root - - Examples: - >>> mapper = PathMapper() - >>> mapper.index_to_source( - ... Path("~/.codexlens/indexes/D/Claude_dms3/src/_index.db") - ... ) - WindowsPath('D:/Claude_dms3/src') - - >>> mapper.index_to_source( - ... Path("~/.codexlens/indexes/D/Claude_dms3/src") - ... ) - WindowsPath('D:/Claude_dms3/src') - """ - index_path = index_path.resolve() - - # Remove _index.db if present - if index_path.name == self.INDEX_DB_NAME: - index_path = index_path.parent - - # Verify path is under index_root - try: - relative = index_path.relative_to(self.index_root) - except ValueError: - raise ValueError( - f"Index path {index_path} is not under index root {self.index_root}" - ) - - # Convert normalized path back to source path - normalized_str = str(relative).replace("\\", "/") - return self.denormalize_path(normalized_str) - - def get_project_root(self, source_path: Path) -> Path: - """Find the project root directory (topmost indexed directory). - - Walks up the directory tree to find the highest-level directory - that has an index database. - - Args: - source_path: Source directory to start from - - Returns: - Project root directory path. Returns source_path itself if - no parent index is found. 
- - Examples: - >>> mapper = PathMapper() - >>> mapper.get_project_root(Path("D:/Claude_dms3/src/codexlens")) - WindowsPath('D:/Claude_dms3') - """ - source_path = source_path.resolve() - current = source_path - project_root = source_path - - # Walk up the tree - while current.parent != current: # Stop at filesystem root - parent_index_db = self.source_to_index_db(current.parent) - if parent_index_db.exists(): - project_root = current.parent - current = current.parent - else: - break - - return project_root - - def get_relative_depth(self, source_path: Path, project_root: Path) -> int: - """Calculate directory depth relative to project root. - - Args: - source_path: Target directory path - project_root: Project root directory path - - Returns: - Number of directory levels from project_root to source_path - - Raises: - ValueError: If source_path is not under project_root - - Examples: - >>> mapper = PathMapper() - >>> mapper.get_relative_depth( - ... Path("D:/Claude_dms3/src/codexlens"), - ... Path("D:/Claude_dms3") - ... ) - 2 - """ - source_path = source_path.resolve() - project_root = project_root.resolve() - - try: - relative = source_path.relative_to(project_root) - # Count path components - return len(relative.parts) - except ValueError: - raise ValueError( - f"Source path {source_path} is not under project root {project_root}" - ) - - def normalize_path(self, path: Path) -> str: - """Normalize path to cross-platform storage format. 
- - Converts OS-specific paths to a standardized format for storage: - - Windows: Removes drive colons (D: → D) - - Unix: Removes leading slash - - Uses forward slashes throughout - - Args: - path: Path to normalize - - Returns: - Normalized path string - - Examples: - >>> mapper = PathMapper() - >>> mapper.normalize_path(Path("D:/path/to/dir")) - 'D/path/to/dir' - - >>> mapper.normalize_path(Path("/home/user/path")) - 'home/user/path' - """ - path = path.resolve() - path_str = str(path) - - # Handle Windows paths with drive letters - if platform.system() == "Windows" and len(path.parts) > 0: - # Convert D:\path\to\dir → D/path/to/dir - drive = path.parts[0].replace(":", "") # D: → D - rest = Path(*path.parts[1:]) if len(path.parts) > 1 else Path() - normalized = f"{drive}/{rest}".replace("\\", "/") - return normalized.rstrip("/") - - # Handle Unix paths - # /home/user/path → home/user/path - return path_str.lstrip("/").replace("\\", "/") - - def denormalize_path(self, normalized: str) -> Path: - """Convert normalized path back to OS-specific path. 
- - Reverses the normalization process to restore OS-native path format: - - Windows: Adds drive colons (D → D:) - - Unix: Adds leading slash - - Args: - normalized: Normalized path string - - Returns: - OS-specific Path object - - Examples: - >>> mapper = PathMapper() - >>> mapper.denormalize_path("D/path/to/dir") # On Windows - WindowsPath('D:/path/to/dir') - - >>> mapper.denormalize_path("home/user/path") # On Unix - PosixPath('/home/user/path') - """ - parts = normalized.split("/") - - # Handle Windows paths - if platform.system() == "Windows" and len(parts) > 0: - # Check if first part is a drive letter - if len(parts[0]) == 1 and parts[0].isalpha(): - # D/path/to/dir → D:/path/to/dir - drive = f"{parts[0]}:" - if len(parts) > 1: - return Path(drive) / Path(*parts[1:]) - return Path(drive) - - # Handle Unix paths or relative paths - # home/user/path → /home/user/path - return Path("/") / Path(*parts) diff --git a/codex-lens/build/lib/codexlens/storage/registry.py b/codex-lens/build/lib/codexlens/storage/registry.py deleted file mode 100644 index 6a4469ab..00000000 --- a/codex-lens/build/lib/codexlens/storage/registry.py +++ /dev/null @@ -1,683 +0,0 @@ -"""Global project registry for CodexLens - SQLite storage.""" - -from __future__ import annotations - -import platform -import sqlite3 -import threading -import time -from dataclasses import dataclass -from pathlib import Path -from typing import Any, Dict, List, Optional - -from codexlens.errors import StorageError - - -@dataclass -class ProjectInfo: - """Registered project information.""" - - id: int - source_root: Path - index_root: Path - created_at: float - last_indexed: float - total_files: int - total_dirs: int - status: str - - -@dataclass -class DirMapping: - """Directory to index path mapping.""" - - id: int - project_id: int - source_path: Path - index_path: Path - depth: int - files_count: int - last_updated: float - - -class RegistryStore: - """Global project registry - SQLite storage. 
- - Manages indexed projects and directory-to-index path mappings. - Thread-safe with connection pooling. - """ - - DEFAULT_DB_PATH = Path.home() / ".codexlens" / "registry.db" - - def __init__(self, db_path: Path | None = None) -> None: - self.db_path = (db_path or self.DEFAULT_DB_PATH).resolve() - self._lock = threading.RLock() - self._local = threading.local() - self._pool_lock = threading.Lock() - self._pool: Dict[int, sqlite3.Connection] = {} - self._pool_generation = 0 - - def _get_connection(self) -> sqlite3.Connection: - """Get or create a thread-local database connection.""" - thread_id = threading.get_ident() - if getattr(self._local, "generation", None) == self._pool_generation: - conn = getattr(self._local, "conn", None) - if conn is not None: - return conn - - with self._pool_lock: - conn = self._pool.get(thread_id) - if conn is None: - conn = sqlite3.connect(self.db_path, check_same_thread=False) - conn.row_factory = sqlite3.Row - conn.execute("PRAGMA journal_mode=WAL") - conn.execute("PRAGMA synchronous=NORMAL") - conn.execute("PRAGMA foreign_keys=ON") - self._pool[thread_id] = conn - - self._local.conn = conn - self._local.generation = self._pool_generation - return conn - - def close(self) -> None: - """Close all pooled connections.""" - with self._lock: - with self._pool_lock: - for conn in self._pool.values(): - conn.close() - self._pool.clear() - self._pool_generation += 1 - - if hasattr(self._local, "conn"): - self._local.conn = None - if hasattr(self._local, "generation"): - self._local.generation = self._pool_generation - - def __enter__(self) -> RegistryStore: - self.initialize() - return self - - def __exit__(self, exc_type: object, exc: object, tb: object) -> None: - self.close() - - def initialize(self) -> None: - """Create database and schema.""" - with self._lock: - self.db_path.parent.mkdir(parents=True, exist_ok=True) - conn = self._get_connection() - self._create_schema(conn) - - def _create_schema(self, conn: sqlite3.Connection) -> 
None: - """Create database schema.""" - try: - conn.execute( - """ - CREATE TABLE IF NOT EXISTS projects ( - id INTEGER PRIMARY KEY, - source_root TEXT UNIQUE NOT NULL, - index_root TEXT NOT NULL, - created_at REAL, - last_indexed REAL, - total_files INTEGER DEFAULT 0, - total_dirs INTEGER DEFAULT 0, - status TEXT DEFAULT 'active' - ) - """ - ) - - conn.execute( - """ - CREATE TABLE IF NOT EXISTS dir_mapping ( - id INTEGER PRIMARY KEY, - project_id INTEGER REFERENCES projects(id) ON DELETE CASCADE, - source_path TEXT NOT NULL, - index_path TEXT NOT NULL, - depth INTEGER, - files_count INTEGER DEFAULT 0, - last_updated REAL, - UNIQUE(source_path) - ) - """ - ) - - conn.execute( - "CREATE INDEX IF NOT EXISTS idx_dir_source ON dir_mapping(source_path)" - ) - conn.execute( - "CREATE INDEX IF NOT EXISTS idx_dir_project ON dir_mapping(project_id)" - ) - conn.execute( - "CREATE INDEX IF NOT EXISTS idx_project_source ON projects(source_root)" - ) - - conn.commit() - except sqlite3.DatabaseError as exc: - raise StorageError(f"Failed to initialize registry schema: {exc}") from exc - - def _normalize_path_for_comparison(self, path: Path) -> str: - """Normalize paths for comparisons and storage. - - Windows paths are treated as case-insensitive, so normalize to lowercase. - Unix platforms preserve case sensitivity. - """ - path_str = str(path) - if platform.system() == "Windows": - return path_str.lower() - return path_str - - # === Project Operations === - - def register_project(self, source_root: Path, index_root: Path) -> ProjectInfo: - """Register a new project or update existing one. 
- - Args: - source_root: Source code root directory - index_root: Index storage root directory - - Returns: - ProjectInfo for the registered project - """ - with self._lock: - conn = self._get_connection() - source_root_str = self._normalize_path_for_comparison(source_root.resolve()) - index_root_str = str(index_root.resolve()) - now = time.time() - - conn.execute( - """ - INSERT INTO projects(source_root, index_root, created_at, last_indexed) - VALUES(?, ?, ?, ?) - ON CONFLICT(source_root) DO UPDATE SET - index_root=excluded.index_root, - last_indexed=excluded.last_indexed, - status='active' - """, - (source_root_str, index_root_str, now, now), - ) - - row = conn.execute( - "SELECT * FROM projects WHERE source_root=?", (source_root_str,) - ).fetchone() - - conn.commit() - - if not row: - raise StorageError(f"Failed to register project: {source_root}") - - return self._row_to_project_info(row) - - def unregister_project(self, source_root: Path) -> bool: - """Remove a project registration (cascades to directory mappings). - - Args: - source_root: Source code root directory - - Returns: - True if project was removed, False if not found - """ - with self._lock: - conn = self._get_connection() - source_root_str = self._normalize_path_for_comparison(source_root.resolve()) - - row = conn.execute( - "SELECT id FROM projects WHERE source_root=?", (source_root_str,) - ).fetchone() - - if not row: - return False - - conn.execute("DELETE FROM projects WHERE source_root=?", (source_root_str,)) - conn.commit() - return True - - def get_project(self, source_root: Path) -> Optional[ProjectInfo]: - """Get project information by source root. 
- - Args: - source_root: Source code root directory - - Returns: - ProjectInfo if found, None otherwise - """ - with self._lock: - conn = self._get_connection() - source_root_str = self._normalize_path_for_comparison(source_root.resolve()) - - row = conn.execute( - "SELECT * FROM projects WHERE source_root=?", (source_root_str,) - ).fetchone() - - return self._row_to_project_info(row) if row else None - - def get_project_by_id(self, project_id: int) -> Optional[ProjectInfo]: - """Get project information by ID. - - Args: - project_id: Project database ID - - Returns: - ProjectInfo if found, None otherwise - """ - with self._lock: - conn = self._get_connection() - - row = conn.execute( - "SELECT * FROM projects WHERE id=?", (project_id,) - ).fetchone() - - return self._row_to_project_info(row) if row else None - - def list_projects(self, status: Optional[str] = None) -> List[ProjectInfo]: - """List all registered projects. - - Args: - status: Optional status filter ('active', 'stale', 'removed') - - Returns: - List of ProjectInfo objects - """ - with self._lock: - conn = self._get_connection() - - if status: - rows = conn.execute( - "SELECT * FROM projects WHERE status=? ORDER BY created_at DESC", - (status,), - ).fetchall() - else: - rows = conn.execute( - "SELECT * FROM projects ORDER BY created_at DESC" - ).fetchall() - - return [self._row_to_project_info(row) for row in rows] - - def update_project_stats( - self, source_root: Path, total_files: int, total_dirs: int - ) -> None: - """Update project statistics. - - Args: - source_root: Source code root directory - total_files: Total number of indexed files - total_dirs: Total number of indexed directories - """ - with self._lock: - conn = self._get_connection() - source_root_str = self._normalize_path_for_comparison(source_root.resolve()) - - conn.execute( - """ - UPDATE projects - SET total_files=?, total_dirs=?, last_indexed=? - WHERE source_root=? 
- """, - (total_files, total_dirs, time.time(), source_root_str), - ) - conn.commit() - - def set_project_status(self, source_root: Path, status: str) -> None: - """Set project status. - - Args: - source_root: Source code root directory - status: Status string ('active', 'stale', 'removed') - """ - with self._lock: - conn = self._get_connection() - source_root_str = self._normalize_path_for_comparison(source_root.resolve()) - - conn.execute( - "UPDATE projects SET status=? WHERE source_root=?", - (status, source_root_str), - ) - conn.commit() - - # === Directory Mapping Operations === - - def register_dir( - self, - project_id: int, - source_path: Path, - index_path: Path, - depth: int, - files_count: int = 0, - ) -> DirMapping: - """Register a directory mapping. - - Args: - project_id: Project database ID - source_path: Source directory path - index_path: Index database path - depth: Directory depth relative to project root - files_count: Number of files in directory - - Returns: - DirMapping for the registered directory - """ - with self._lock: - conn = self._get_connection() - source_path_str = self._normalize_path_for_comparison(source_path.resolve()) - index_path_str = str(index_path.resolve()) - now = time.time() - - conn.execute( - """ - INSERT INTO dir_mapping( - project_id, source_path, index_path, depth, files_count, last_updated - ) - VALUES(?, ?, ?, ?, ?, ?) - ON CONFLICT(source_path) DO UPDATE SET - index_path=excluded.index_path, - depth=excluded.depth, - files_count=excluded.files_count, - last_updated=excluded.last_updated - """, - (project_id, source_path_str, index_path_str, depth, files_count, now), - ) - - row = conn.execute( - "SELECT * FROM dir_mapping WHERE source_path=?", (source_path_str,) - ).fetchone() - - conn.commit() - - if not row: - raise StorageError(f"Failed to register directory: {source_path}") - - return self._row_to_dir_mapping(row) - - def unregister_dir(self, source_path: Path) -> bool: - """Remove a directory mapping. 
- - Args: - source_path: Source directory path - - Returns: - True if directory was removed, False if not found - """ - with self._lock: - conn = self._get_connection() - source_path_str = self._normalize_path_for_comparison(source_path.resolve()) - - row = conn.execute( - "SELECT id FROM dir_mapping WHERE source_path=?", (source_path_str,) - ).fetchone() - - if not row: - return False - - conn.execute("DELETE FROM dir_mapping WHERE source_path=?", (source_path_str,)) - conn.commit() - return True - - def find_index_path(self, source_path: Path) -> Optional[Path]: - """Find index path for a source directory (exact match). - - Args: - source_path: Source directory path - - Returns: - Index path if found, None otherwise - """ - with self._lock: - conn = self._get_connection() - source_path_str = self._normalize_path_for_comparison(source_path.resolve()) - - row = conn.execute( - "SELECT index_path FROM dir_mapping WHERE source_path=?", - (source_path_str,), - ).fetchone() - - return Path(row["index_path"]) if row else None - - def find_nearest_index(self, source_path: Path) -> Optional[DirMapping]: - """Find nearest indexed ancestor directory. - - Searches for the closest parent directory that has an index. - Useful for supporting subdirectory searches. - - Optimized to use single database query instead of iterating through - each parent directory level. 
- - Args: - source_path: Source directory or file path - - Returns: - DirMapping for nearest ancestor, None if not found - """ - with self._lock: - conn = self._get_connection() - source_path_resolved = source_path.resolve() - - # Build list of all parent paths from deepest to shallowest - paths_to_check = [] - current = source_path_resolved - while True: - paths_to_check.append(self._normalize_path_for_comparison(current)) - parent = current.parent - if parent == current: # Reached filesystem root - break - current = parent - - if not paths_to_check: - return None - - # Single query with WHERE IN, ordered by path length (longest = nearest) - placeholders = ','.join('?' * len(paths_to_check)) - query = f""" - SELECT * FROM dir_mapping - WHERE source_path IN ({placeholders}) - ORDER BY LENGTH(source_path) DESC - LIMIT 1 - """ - - row = conn.execute(query, paths_to_check).fetchone() - return self._row_to_dir_mapping(row) if row else None - - def find_by_source_path(self, source_path: str) -> Optional[Dict[str, str]]: - """Find project by source path (exact or nearest match). - - Searches for a project whose source_root matches or contains - the given source_path. 
- - Args: - source_path: Source directory path as string - - Returns: - Dict with project info including 'index_root', or None if not found - """ - with self._lock: - conn = self._get_connection() - resolved_path = Path(source_path).resolve() - source_path_resolved = self._normalize_path_for_comparison(resolved_path) - - # First try exact match on projects table - row = conn.execute( - "SELECT * FROM projects WHERE source_root=?", (source_path_resolved,) - ).fetchone() - - if row: - return { - "id": str(row["id"]), - "source_root": row["source_root"], - "index_root": row["index_root"], - "status": row["status"] or "active", - } - - # Try finding project that contains this path - # Build list of all parent paths - paths_to_check = [] - current = resolved_path - while True: - paths_to_check.append(self._normalize_path_for_comparison(current)) - parent = current.parent - if parent == current: - break - current = parent - - if paths_to_check: - placeholders = ','.join('?' * len(paths_to_check)) - query = f""" - SELECT * FROM projects - WHERE source_root IN ({placeholders}) - ORDER BY LENGTH(source_root) DESC - LIMIT 1 - """ - row = conn.execute(query, paths_to_check).fetchone() - - if row: - return { - "id": str(row["id"]), - "source_root": row["source_root"], - "index_root": row["index_root"], - "status": row["status"] or "active", - } - - return None - - def get_project_dirs(self, project_id: int) -> List[DirMapping]: - """Get all directory mappings for a project. - - Args: - project_id: Project database ID - - Returns: - List of DirMapping objects - """ - with self._lock: - conn = self._get_connection() - - rows = conn.execute( - "SELECT * FROM dir_mapping WHERE project_id=? ORDER BY depth, source_path", - (project_id,), - ).fetchall() - - return [self._row_to_dir_mapping(row) for row in rows] - - def get_subdirs(self, source_path: Path) -> List[DirMapping]: - """Get direct subdirectory mappings. 
- - Args: - source_path: Parent directory path - - Returns: - List of DirMapping objects for direct children - """ - with self._lock: - conn = self._get_connection() - source_path_str = self._normalize_path_for_comparison(source_path.resolve()) - - # First get the parent's depth - parent_row = conn.execute( - "SELECT depth, project_id FROM dir_mapping WHERE source_path=?", - (source_path_str,), - ).fetchone() - - if not parent_row: - return [] - - parent_depth = int(parent_row["depth"]) - project_id = int(parent_row["project_id"]) - - # Get all subdirs with depth = parent_depth + 1 and matching path prefix - rows = conn.execute( - """ - SELECT * FROM dir_mapping - WHERE project_id=? AND depth=? AND source_path LIKE ? - ORDER BY source_path - """, - (project_id, parent_depth + 1, f"{source_path_str}%"), - ).fetchall() - - return [self._row_to_dir_mapping(row) for row in rows] - - def update_dir_stats(self, source_path: Path, files_count: int) -> None: - """Update directory statistics. - - Args: - source_path: Source directory path - files_count: Number of files in directory - """ - with self._lock: - conn = self._get_connection() - source_path_str = self._normalize_path_for_comparison(source_path.resolve()) - - conn.execute( - """ - UPDATE dir_mapping - SET files_count=?, last_updated=? - WHERE source_path=? - """, - (files_count, time.time(), source_path_str), - ) - conn.commit() - - def update_index_paths(self, old_root: Path, new_root: Path) -> int: - """Update all index paths after migration. - - Replaces old_root prefix with new_root in all stored index paths. - - Args: - old_root: Old index root directory - new_root: New index root directory - - Returns: - Number of paths updated - """ - with self._lock: - conn = self._get_connection() - old_root_str = str(old_root.resolve()) - new_root_str = str(new_root.resolve()) - updated = 0 - - # Update projects - conn.execute( - """ - UPDATE projects - SET index_root = REPLACE(index_root, ?, ?) - WHERE index_root LIKE ? 
- """, - (old_root_str, new_root_str, f"{old_root_str}%"), - ) - updated += conn.total_changes - - # Update dir_mapping - conn.execute( - """ - UPDATE dir_mapping - SET index_path = REPLACE(index_path, ?, ?) - WHERE index_path LIKE ? - """, - (old_root_str, new_root_str, f"{old_root_str}%"), - ) - updated += conn.total_changes - - conn.commit() - return updated - - # === Internal Methods === - - def _row_to_project_info(self, row: sqlite3.Row) -> ProjectInfo: - """Convert database row to ProjectInfo.""" - return ProjectInfo( - id=int(row["id"]), - source_root=Path(row["source_root"]), - index_root=Path(row["index_root"]), - created_at=float(row["created_at"]) if row["created_at"] else 0.0, - last_indexed=float(row["last_indexed"]) if row["last_indexed"] else 0.0, - total_files=int(row["total_files"]) if row["total_files"] else 0, - total_dirs=int(row["total_dirs"]) if row["total_dirs"] else 0, - status=str(row["status"]) if row["status"] else "active", - ) - - def _row_to_dir_mapping(self, row: sqlite3.Row) -> DirMapping: - """Convert database row to DirMapping.""" - return DirMapping( - id=int(row["id"]), - project_id=int(row["project_id"]), - source_path=Path(row["source_path"]), - index_path=Path(row["index_path"]), - depth=int(row["depth"]) if row["depth"] is not None else 0, - files_count=int(row["files_count"]) if row["files_count"] else 0, - last_updated=float(row["last_updated"]) if row["last_updated"] else 0.0, - ) diff --git a/codex-lens/build/lib/codexlens/storage/splade_index.py b/codex-lens/build/lib/codexlens/storage/splade_index.py deleted file mode 100644 index d090a12d..00000000 --- a/codex-lens/build/lib/codexlens/storage/splade_index.py +++ /dev/null @@ -1,578 +0,0 @@ -"""SPLADE inverted index storage for sparse vector retrieval. - -This module implements SQLite-based inverted index for SPLADE sparse vectors, -enabling efficient sparse retrieval using dot-product scoring. 
-""" - -from __future__ import annotations - -import logging -import sqlite3 -import threading -import time -from pathlib import Path -from typing import Dict, List, Optional, Tuple - -from codexlens.entities import SearchResult -from codexlens.errors import StorageError - -logger = logging.getLogger(__name__) - - -class SpladeIndex: - """SQLite-based inverted index for SPLADE sparse vectors. - - Stores sparse vectors as posting lists mapping token_id -> (chunk_id, weight). - Supports efficient dot-product retrieval using SQL joins. - """ - - def __init__(self, db_path: Path | str) -> None: - """Initialize SPLADE index. - - Args: - db_path: Path to SQLite database file. - """ - self.db_path = Path(db_path) - self.db_path.parent.mkdir(parents=True, exist_ok=True) - - # Thread-safe connection management - self._lock = threading.RLock() - self._local = threading.local() - - def _get_connection(self) -> sqlite3.Connection: - """Get or create a thread-local database connection. - - Each thread gets its own connection to ensure thread safety. - Connections are stored in thread-local storage. 
- """ - conn = getattr(self._local, "conn", None) - if conn is None: - # Thread-local connection - each thread has its own - conn = sqlite3.connect( - self.db_path, - timeout=30.0, # Wait up to 30s for locks - check_same_thread=True, # Enforce thread safety - ) - conn.row_factory = sqlite3.Row - conn.execute("PRAGMA journal_mode=WAL") - conn.execute("PRAGMA synchronous=NORMAL") - conn.execute("PRAGMA foreign_keys=ON") - # Limit mmap to 1GB to avoid OOM on smaller systems - conn.execute("PRAGMA mmap_size=1073741824") - # Increase cache size for better query performance (20MB = -20000 pages) - conn.execute("PRAGMA cache_size=-20000") - self._local.conn = conn - return conn - - def close(self) -> None: - """Close thread-local database connection.""" - with self._lock: - conn = getattr(self._local, "conn", None) - if conn is not None: - conn.close() - self._local.conn = None - - def __enter__(self) -> SpladeIndex: - """Context manager entry.""" - self.create_tables() - return self - - def __exit__(self, exc_type, exc, tb) -> None: - """Context manager exit.""" - self.close() - - def has_index(self) -> bool: - """Check if SPLADE tables exist in database. - - Returns: - True if tables exist, False otherwise. - """ - with self._lock: - conn = self._get_connection() - try: - cursor = conn.execute( - """ - SELECT name FROM sqlite_master - WHERE type='table' AND name='splade_posting_list' - """ - ) - return cursor.fetchone() is not None - except sqlite3.Error as e: - logger.error("Failed to check index existence: %s", e) - return False - - def create_tables(self) -> None: - """Create SPLADE schema if not exists. - - Note: When used with distributed indexes (multiple _index.db files), - the SPLADE database stores chunk IDs from multiple sources. In this case, - foreign key constraints are not enforced to allow cross-database references. 
- """ - with self._lock: - conn = self._get_connection() - try: - # Inverted index for sparse vectors - # Note: No FOREIGN KEY constraint to support distributed index architecture - # where chunks may come from multiple _index.db files - conn.execute(""" - CREATE TABLE IF NOT EXISTS splade_posting_list ( - token_id INTEGER NOT NULL, - chunk_id INTEGER NOT NULL, - weight REAL NOT NULL, - PRIMARY KEY (token_id, chunk_id) - ) - """) - - # Indexes for efficient lookups - conn.execute(""" - CREATE INDEX IF NOT EXISTS idx_splade_by_chunk - ON splade_posting_list(chunk_id) - """) - conn.execute(""" - CREATE INDEX IF NOT EXISTS idx_splade_by_token - ON splade_posting_list(token_id) - """) - - # Model metadata - conn.execute(""" - CREATE TABLE IF NOT EXISTS splade_metadata ( - id INTEGER PRIMARY KEY DEFAULT 1, - model_name TEXT NOT NULL, - vocab_size INTEGER NOT NULL, - onnx_path TEXT, - created_at REAL - ) - """) - - # Chunk metadata for self-contained search results - # Stores all chunk info needed to build SearchResult without querying _index.db - conn.execute(""" - CREATE TABLE IF NOT EXISTS splade_chunks ( - id INTEGER PRIMARY KEY, - file_path TEXT NOT NULL, - content TEXT NOT NULL, - metadata TEXT, - source_db TEXT - ) - """) - - conn.commit() - logger.debug("SPLADE schema created successfully") - except sqlite3.Error as e: - raise StorageError( - f"Failed to create SPLADE schema: {e}", - db_path=str(self.db_path), - operation="create_tables" - ) from e - - def add_posting(self, chunk_id: int, sparse_vec: Dict[int, float]) -> None: - """Add a single document to inverted index. - - Args: - chunk_id: Chunk ID (foreign key to semantic_chunks.id). - sparse_vec: Sparse vector as {token_id: weight} mapping. 
- """ - if not sparse_vec: - logger.warning("Empty sparse vector for chunk_id=%d, skipping", chunk_id) - return - - with self._lock: - conn = self._get_connection() - try: - # Insert all non-zero weights for this chunk - postings = [ - (token_id, chunk_id, weight) - for token_id, weight in sparse_vec.items() - if weight > 0 # Only store non-zero weights - ] - - if postings: - conn.executemany( - """ - INSERT OR REPLACE INTO splade_posting_list - (token_id, chunk_id, weight) - VALUES (?, ?, ?) - """, - postings - ) - conn.commit() - logger.debug( - "Added %d postings for chunk_id=%d", len(postings), chunk_id - ) - except sqlite3.Error as e: - raise StorageError( - f"Failed to add posting for chunk_id={chunk_id}: {e}", - db_path=str(self.db_path), - operation="add_posting" - ) from e - - def add_postings_batch( - self, postings: List[Tuple[int, Dict[int, float]]] - ) -> None: - """Batch insert postings for multiple chunks. - - Args: - postings: List of (chunk_id, sparse_vec) tuples. - """ - if not postings: - return - - with self._lock: - conn = self._get_connection() - try: - # Flatten all postings into single batch - batch_data = [] - for chunk_id, sparse_vec in postings: - for token_id, weight in sparse_vec.items(): - if weight > 0: # Only store non-zero weights - batch_data.append((token_id, chunk_id, weight)) - - if batch_data: - conn.executemany( - """ - INSERT OR REPLACE INTO splade_posting_list - (token_id, chunk_id, weight) - VALUES (?, ?, ?) 
- """, - batch_data - ) - conn.commit() - logger.debug( - "Batch inserted %d postings for %d chunks", - len(batch_data), - len(postings) - ) - except sqlite3.Error as e: - raise StorageError( - f"Failed to batch insert postings: {e}", - db_path=str(self.db_path), - operation="add_postings_batch" - ) from e - - def add_chunk_metadata( - self, - chunk_id: int, - file_path: str, - content: str, - metadata: Optional[str] = None, - source_db: Optional[str] = None - ) -> None: - """Store chunk metadata for self-contained search results. - - Args: - chunk_id: Global chunk ID. - file_path: Path to source file. - content: Chunk text content. - metadata: JSON metadata string. - source_db: Path to source _index.db. - """ - with self._lock: - conn = self._get_connection() - try: - conn.execute( - """ - INSERT OR REPLACE INTO splade_chunks - (id, file_path, content, metadata, source_db) - VALUES (?, ?, ?, ?, ?) - """, - (chunk_id, file_path, content, metadata, source_db) - ) - conn.commit() - except sqlite3.Error as e: - raise StorageError( - f"Failed to add chunk metadata for chunk_id={chunk_id}: {e}", - db_path=str(self.db_path), - operation="add_chunk_metadata" - ) from e - - def add_chunks_metadata_batch( - self, - chunks: List[Tuple[int, str, str, Optional[str], Optional[str]]] - ) -> None: - """Batch insert chunk metadata. - - Args: - chunks: List of (chunk_id, file_path, content, metadata, source_db) tuples. - """ - if not chunks: - return - - with self._lock: - conn = self._get_connection() - try: - conn.executemany( - """ - INSERT OR REPLACE INTO splade_chunks - (id, file_path, content, metadata, source_db) - VALUES (?, ?, ?, ?, ?) 
- """, - chunks - ) - conn.commit() - logger.debug("Batch inserted %d chunk metadata records", len(chunks)) - except sqlite3.Error as e: - raise StorageError( - f"Failed to batch insert chunk metadata: {e}", - db_path=str(self.db_path), - operation="add_chunks_metadata_batch" - ) from e - - def get_chunks_by_ids(self, chunk_ids: List[int]) -> List[Dict]: - """Get chunk metadata by IDs. - - Args: - chunk_ids: List of chunk IDs to retrieve. - - Returns: - List of dicts with id, file_path, content, metadata, source_db. - """ - if not chunk_ids: - return [] - - with self._lock: - conn = self._get_connection() - try: - placeholders = ",".join("?" * len(chunk_ids)) - rows = conn.execute( - f""" - SELECT id, file_path, content, metadata, source_db - FROM splade_chunks - WHERE id IN ({placeholders}) - """, - chunk_ids - ).fetchall() - - return [ - { - "id": row["id"], - "file_path": row["file_path"], - "content": row["content"], - "metadata": row["metadata"], - "source_db": row["source_db"] - } - for row in rows - ] - except sqlite3.Error as e: - logger.error("Failed to get chunks by IDs: %s", e) - return [] - - def remove_chunk(self, chunk_id: int) -> int: - """Remove all postings for a chunk. - - Args: - chunk_id: Chunk ID to remove. - - Returns: - Number of deleted postings. - """ - with self._lock: - conn = self._get_connection() - try: - cursor = conn.execute( - "DELETE FROM splade_posting_list WHERE chunk_id = ?", - (chunk_id,) - ) - conn.commit() - deleted = cursor.rowcount - logger.debug("Removed %d postings for chunk_id=%d", deleted, chunk_id) - return deleted - except sqlite3.Error as e: - raise StorageError( - f"Failed to remove chunk_id={chunk_id}: {e}", - db_path=str(self.db_path), - operation="remove_chunk" - ) from e - - def search( - self, - query_sparse: Dict[int, float], - limit: int = 50, - min_score: float = 0.0, - max_query_terms: int = 64 - ) -> List[Tuple[int, float]]: - """Search for similar chunks using dot-product scoring. 
- - Implements efficient sparse dot-product via SQL JOIN: - score(q, d) = sum(q[t] * d[t]) for all tokens t - - Args: - query_sparse: Query sparse vector as {token_id: weight}. - limit: Maximum number of results. - min_score: Minimum score threshold. - max_query_terms: Maximum query terms to use (default: 64). - Pruning to top-K terms reduces search time with minimal impact on quality. - Set to 0 or negative to disable pruning (use all terms). - - Returns: - List of (chunk_id, score) tuples, ordered by score descending. - """ - if not query_sparse: - logger.warning("Empty query sparse vector") - return [] - - with self._lock: - conn = self._get_connection() - try: - # Build VALUES clause for query terms - # Each term: (token_id, weight) - query_terms = [ - (token_id, weight) - for token_id, weight in query_sparse.items() - if weight > 0 - ] - - if not query_terms: - logger.warning("No non-zero query terms") - return [] - - # Query pruning: keep only top-K terms by weight - # max_query_terms <= 0 means no limit (use all terms) - if max_query_terms > 0 and len(query_terms) > max_query_terms: - query_terms = sorted(query_terms, key=lambda x: x[1], reverse=True)[:max_query_terms] - logger.debug( - "Query pruned from %d to %d terms", - len(query_sparse), - len(query_terms) - ) - - # Create CTE for query terms using parameterized VALUES - # Build placeholders and params to prevent SQL injection - params = [] - placeholders = [] - for token_id, weight in query_terms: - placeholders.append("(?, ?)") - params.extend([token_id, weight]) - - values_placeholders = ", ".join(placeholders) - - sql = f""" - WITH query_terms(token_id, weight) AS ( - VALUES {values_placeholders} - ) - SELECT - p.chunk_id, - SUM(p.weight * q.weight) as score - FROM splade_posting_list p - INNER JOIN query_terms q ON p.token_id = q.token_id - GROUP BY p.chunk_id - HAVING score >= ? - ORDER BY score DESC - LIMIT ? 
- """ - - # Append min_score and limit to params - params.extend([min_score, limit]) - rows = conn.execute(sql, params).fetchall() - - results = [(row["chunk_id"], float(row["score"])) for row in rows] - logger.debug( - "SPLADE search: %d query terms, %d results", - len(query_terms), - len(results) - ) - return results - - except sqlite3.Error as e: - raise StorageError( - f"SPLADE search failed: {e}", - db_path=str(self.db_path), - operation="search" - ) from e - - def get_metadata(self) -> Optional[Dict]: - """Get SPLADE model metadata. - - Returns: - Dictionary with model_name, vocab_size, onnx_path, created_at, - or None if not set. - """ - with self._lock: - conn = self._get_connection() - try: - row = conn.execute( - """ - SELECT model_name, vocab_size, onnx_path, created_at - FROM splade_metadata - WHERE id = 1 - """ - ).fetchone() - - if row: - return { - "model_name": row["model_name"], - "vocab_size": row["vocab_size"], - "onnx_path": row["onnx_path"], - "created_at": row["created_at"] - } - return None - except sqlite3.Error as e: - logger.error("Failed to get metadata: %s", e) - return None - - def set_metadata( - self, - model_name: str, - vocab_size: int, - onnx_path: Optional[str] = None - ) -> None: - """Set SPLADE model metadata. - - Args: - model_name: SPLADE model name. - vocab_size: Vocabulary size (typically ~30k for BERT vocab). - onnx_path: Optional path to ONNX model file. - """ - with self._lock: - conn = self._get_connection() - try: - current_time = time.time() - conn.execute( - """ - INSERT OR REPLACE INTO splade_metadata - (id, model_name, vocab_size, onnx_path, created_at) - VALUES (1, ?, ?, ?, ?) 
- """, - (model_name, vocab_size, onnx_path, current_time) - ) - conn.commit() - logger.info( - "Set SPLADE metadata: model=%s, vocab_size=%d", - model_name, - vocab_size - ) - except sqlite3.Error as e: - raise StorageError( - f"Failed to set metadata: {e}", - db_path=str(self.db_path), - operation="set_metadata" - ) from e - - def get_stats(self) -> Dict: - """Get index statistics. - - Returns: - Dictionary with total_postings, unique_tokens, unique_chunks. - """ - with self._lock: - conn = self._get_connection() - try: - row = conn.execute(""" - SELECT - COUNT(*) as total_postings, - COUNT(DISTINCT token_id) as unique_tokens, - COUNT(DISTINCT chunk_id) as unique_chunks - FROM splade_posting_list - """).fetchone() - - return { - "total_postings": row["total_postings"], - "unique_tokens": row["unique_tokens"], - "unique_chunks": row["unique_chunks"] - } - except sqlite3.Error as e: - logger.error("Failed to get stats: %s", e) - return { - "total_postings": 0, - "unique_tokens": 0, - "unique_chunks": 0 - } diff --git a/codex-lens/build/lib/codexlens/storage/sqlite_store.py b/codex-lens/build/lib/codexlens/storage/sqlite_store.py deleted file mode 100644 index 6945be8a..00000000 --- a/codex-lens/build/lib/codexlens/storage/sqlite_store.py +++ /dev/null @@ -1,976 +0,0 @@ -"""SQLite storage for CodexLens indexing and search.""" - -from __future__ import annotations - -import json -import logging -import sqlite3 -import threading -import time -from dataclasses import asdict -from pathlib import Path -from typing import Any, Dict, Iterable, List, Optional, Tuple - -from codexlens.entities import IndexedFile, SearchResult, Symbol -from codexlens.errors import StorageError - -logger = logging.getLogger(__name__) - - -class SQLiteStore: - """SQLiteStore providing FTS5 search and symbol lookup. - - Implements thread-local connection pooling for improved performance. 
- """ - - # Maximum number of connections to keep in pool to prevent memory leaks - MAX_POOL_SIZE = 32 - # Idle timeout in seconds (10 minutes) - IDLE_TIMEOUT = 600 - # Periodic cleanup interval in seconds (5 minutes) - CLEANUP_INTERVAL = 300 - - def __init__(self, db_path: str | Path) -> None: - self.db_path = Path(db_path) - self._lock = threading.RLock() - self._local = threading.local() - self._pool_lock = threading.Lock() - # Pool stores (connection, last_access_time) tuples - self._pool: Dict[int, Tuple[sqlite3.Connection, float]] = {} - self._pool_generation = 0 - self._cleanup_timer: threading.Timer | None = None - self._cleanup_stop_event = threading.Event() - self._start_cleanup_timer() - - def _get_connection(self) -> sqlite3.Connection: - """Get or create a thread-local database connection.""" - thread_id = threading.get_ident() - current_time = time.time() - - if getattr(self._local, "generation", None) == self._pool_generation: - conn = getattr(self._local, "conn", None) - if conn is not None: - with self._pool_lock: - pool_entry = self._pool.get(thread_id) - if pool_entry is not None: - pooled_conn, _ = pool_entry - self._pool[thread_id] = (pooled_conn, current_time) - self._local.conn = pooled_conn - return pooled_conn - - # Thread-local connection is stale (e.g., cleaned up by timer). 
- self._local.conn = None - - with self._pool_lock: - pool_entry = self._pool.get(thread_id) - if pool_entry is not None: - conn, _ = pool_entry - # Update last access time - self._pool[thread_id] = (conn, current_time) - else: - # Clean up stale and idle connections if pool is too large - if len(self._pool) >= self.MAX_POOL_SIZE: - self._cleanup_stale_connections() - - conn = sqlite3.connect(self.db_path, check_same_thread=False) - conn.row_factory = sqlite3.Row - conn.execute("PRAGMA journal_mode=WAL") - conn.execute("PRAGMA synchronous=NORMAL") - conn.execute("PRAGMA foreign_keys=ON") - # Memory-mapped I/O for faster reads (30GB limit) - conn.execute("PRAGMA mmap_size=30000000000") - self._pool[thread_id] = (conn, current_time) - - self._local.conn = conn - self._local.generation = self._pool_generation - return conn - - def _cleanup_stale_connections(self) -> None: - """Remove connections for threads that no longer exist or have been idle too long.""" - current_time = time.time() - # Get list of active thread IDs - active_threads = {t.ident for t in threading.enumerate() if t.ident is not None} - - # Find connections to remove: dead threads or idle timeout exceeded - stale_ids: list[tuple[int, str]] = [] - for tid, (conn, last_access) in list(self._pool.items()): - try: - is_dead_thread = tid not in active_threads - is_idle = (current_time - last_access) > self.IDLE_TIMEOUT - - is_invalid_connection = False - if not is_dead_thread and not is_idle: - try: - conn.execute("SELECT 1").fetchone() - except sqlite3.ProgrammingError: - is_invalid_connection = True - except sqlite3.Error: - is_invalid_connection = True - - if is_invalid_connection: - stale_ids.append((tid, "invalid_connection")) - elif is_dead_thread: - stale_ids.append((tid, "dead_thread")) - elif is_idle: - stale_ids.append((tid, "idle_timeout")) - except Exception: - # Never break cleanup for a single bad entry. 
- continue - - # Close and remove stale connections - for tid, reason in stale_ids: - try: - conn, _ = self._pool[tid] - conn.close() - except Exception: - pass - del self._pool[tid] - logger.debug("Cleaned SQLiteStore connection for thread_id=%s (%s)", tid, reason) - - def _start_cleanup_timer(self) -> None: - if self.CLEANUP_INTERVAL <= 0: - return - - self._cleanup_stop_event.clear() - - def tick() -> None: - if self._cleanup_stop_event.is_set(): - return - - try: - with self._pool_lock: - self._cleanup_stale_connections() - finally: - with self._pool_lock: - if self._cleanup_stop_event.is_set(): - self._cleanup_timer = None - return - - self._cleanup_timer = threading.Timer(self.CLEANUP_INTERVAL, tick) - self._cleanup_timer.daemon = True - self._cleanup_timer.start() - - self._cleanup_timer = threading.Timer(self.CLEANUP_INTERVAL, tick) - self._cleanup_timer.daemon = True - self._cleanup_timer.start() - - def _stop_cleanup_timer(self) -> None: - self._cleanup_stop_event.set() - with self._pool_lock: - if self._cleanup_timer is not None: - self._cleanup_timer.cancel() - self._cleanup_timer = None - - def close(self) -> None: - """Close all pooled connections.""" - with self._lock: - self._stop_cleanup_timer() - with self._pool_lock: - for conn, _ in self._pool.values(): - conn.close() - self._pool.clear() - self._pool_generation += 1 - - if hasattr(self._local, "conn"): - self._local.conn = None - if hasattr(self._local, "generation"): - self._local.generation = self._pool_generation - - def __enter__(self) -> SQLiteStore: - self.initialize() - return self - - def __exit__(self, exc_type: object, exc: object, tb: object) -> None: - self.close() - - def execute_query( - self, - sql: str, - params: tuple = (), - allow_writes: bool = False - ) -> List[Dict[str, Any]]: - """Execute a raw SQL query and return results as dictionaries. - - This is the public API for executing custom queries without bypassing - encapsulation via _get_connection(). 
- - By default, only SELECT queries are allowed. Use allow_writes=True - for trusted internal code that needs to execute other statements. - - Args: - sql: SQL query string with ? placeholders for parameters - params: Tuple of parameter values to bind - allow_writes: If True, allow non-SELECT statements (default False) - - Returns: - List of result rows as dictionaries - - Raises: - StorageError: If query execution fails or validation fails - """ - # Validate query type for security - sql_stripped = sql.strip().upper() - if not allow_writes: - # Only allow SELECT and WITH (for CTEs) statements - if not (sql_stripped.startswith("SELECT") or sql_stripped.startswith("WITH")): - raise StorageError( - "Only SELECT queries are allowed. " - "Use allow_writes=True for trusted internal operations.", - db_path=str(self.db_path), - operation="execute_query", - details={"query_type": sql_stripped.split()[0] if sql_stripped else "EMPTY"} - ) - - try: - conn = self._get_connection() - rows = conn.execute(sql, params).fetchall() - return [dict(row) for row in rows] - except sqlite3.Error as e: - raise StorageError( - f"Query execution failed: {e}", - db_path=str(self.db_path), - operation="execute_query", - details={"error_type": type(e).__name__} - ) from e - - def initialize(self) -> None: - with self._lock: - self.db_path.parent.mkdir(parents=True, exist_ok=True) - conn = self._get_connection() - self._create_schema(conn) - self._ensure_fts_external_content(conn) - - - def add_file(self, indexed_file: IndexedFile, content: str) -> None: - with self._lock: - conn = self._get_connection() - path = str(Path(indexed_file.path).resolve()) - language = indexed_file.language - mtime = Path(path).stat().st_mtime if Path(path).exists() else None - line_count = content.count(chr(10)) + 1 - - conn.execute( - """ - INSERT INTO files(path, language, content, mtime, line_count) - VALUES(?, ?, ?, ?, ?) 
- ON CONFLICT(path) DO UPDATE SET - language=excluded.language, - content=excluded.content, - mtime=excluded.mtime, - line_count=excluded.line_count - """, - (path, language, content, mtime, line_count), - ) - - row = conn.execute("SELECT id FROM files WHERE path=?", (path,)).fetchone() - if not row: - raise StorageError(f"Failed to read file id for {path}") - file_id = int(row["id"]) - - conn.execute("DELETE FROM symbols WHERE file_id=?", (file_id,)) - if indexed_file.symbols: - conn.executemany( - """ - INSERT INTO symbols(file_id, name, kind, start_line, end_line) - VALUES(?, ?, ?, ?, ?) - """, - [ - (file_id, s.name, s.kind, s.range[0], s.range[1]) - for s in indexed_file.symbols - ], - ) - conn.commit() - - def add_files(self, files_data: List[tuple[IndexedFile, str]]) -> None: - """Add multiple files in a single transaction for better performance. - - Args: - files_data: List of (indexed_file, content) tuples - """ - with self._lock: - conn = self._get_connection() - try: - conn.execute("BEGIN") - - for indexed_file, content in files_data: - path = str(Path(indexed_file.path).resolve()) - language = indexed_file.language - mtime = Path(path).stat().st_mtime if Path(path).exists() else None - line_count = content.count(chr(10)) + 1 - - conn.execute( - """ - INSERT INTO files(path, language, content, mtime, line_count) - VALUES(?, ?, ?, ?, ?) - ON CONFLICT(path) DO UPDATE SET - language=excluded.language, - content=excluded.content, - mtime=excluded.mtime, - line_count=excluded.line_count - """, - (path, language, content, mtime, line_count), - ) - - row = conn.execute("SELECT id FROM files WHERE path=?", (path,)).fetchone() - if not row: - raise StorageError(f"Failed to read file id for {path}") - file_id = int(row["id"]) - - conn.execute("DELETE FROM symbols WHERE file_id=?", (file_id,)) - if indexed_file.symbols: - conn.executemany( - """ - INSERT INTO symbols(file_id, name, kind, start_line, end_line) - VALUES(?, ?, ?, ?, ?) 
- """, - [ - (file_id, s.name, s.kind, s.range[0], s.range[1]) - for s in indexed_file.symbols - ], - ) - - conn.commit() - except Exception as exc: - try: - conn.rollback() - except Exception as rollback_exc: - logger.error( - "Rollback failed after add_files() error (%s): %s", exc, rollback_exc - ) - raise exc.with_traceback(exc.__traceback__) from rollback_exc - raise - - def remove_file(self, path: str | Path) -> bool: - """Remove a file from the index.""" - with self._lock: - conn = self._get_connection() - resolved_path = str(Path(path).resolve()) - - row = conn.execute( - "SELECT id FROM files WHERE path=?", (resolved_path,) - ).fetchone() - - if not row: - return False - - file_id = int(row["id"]) - conn.execute("DELETE FROM files WHERE id=?", (file_id,)) - conn.commit() - return True - - def file_exists(self, path: str | Path) -> bool: - """Check if a file exists in the index.""" - with self._lock: - conn = self._get_connection() - resolved_path = str(Path(path).resolve()) - row = conn.execute( - "SELECT 1 FROM files WHERE path=?", (resolved_path,) - ).fetchone() - return row is not None - - def get_file_mtime(self, path: str | Path) -> float | None: - """Get the stored mtime for a file.""" - with self._lock: - conn = self._get_connection() - resolved_path = str(Path(path).resolve()) - row = conn.execute( - "SELECT mtime FROM files WHERE path=?", (resolved_path,) - ).fetchone() - return float(row["mtime"]) if row and row["mtime"] else None - - - def search_fts(self, query: str, *, limit: int = 20, offset: int = 0) -> List[SearchResult]: - with self._lock: - conn = self._get_connection() - try: - rows = conn.execute( - """ - SELECT rowid, path, bm25(files_fts) AS rank, - snippet(files_fts, 2, '[bold red]', '[/bold red]', "...", 20) AS excerpt - FROM files_fts - WHERE files_fts MATCH ? - ORDER BY rank - LIMIT ? OFFSET ? 
- """, - (query, limit, offset), - ).fetchall() - except sqlite3.DatabaseError as exc: - raise StorageError(f"FTS search failed: {exc}") from exc - - results: List[SearchResult] = [] - for row in rows: - rank = float(row["rank"]) if row["rank"] is not None else 0.0 - score = abs(rank) if rank < 0 else 0.0 - results.append( - SearchResult( - path=row["path"], - score=score, - excerpt=row["excerpt"], - ) - ) - return results - - def search_files_only( - self, query: str, *, limit: int = 20, offset: int = 0 - ) -> List[str]: - """Search indexed file contents and return only file paths.""" - with self._lock: - conn = self._get_connection() - try: - rows = conn.execute( - """ - SELECT path - FROM files_fts - WHERE files_fts MATCH ? - ORDER BY bm25(files_fts) - LIMIT ? OFFSET ? - """, - (query, limit, offset), - ).fetchall() - except sqlite3.DatabaseError as exc: - raise StorageError(f"FTS search failed: {exc}") from exc - - return [row["path"] for row in rows] - - def search_symbols( - self, name: str, *, kind: Optional[str] = None, limit: int = 50 - ) -> List[Symbol]: - pattern = f"%{name}%" - with self._lock: - conn = self._get_connection() - if kind: - rows = conn.execute( - """ - SELECT name, kind, start_line, end_line - FROM symbols - WHERE name LIKE ? AND kind=? - ORDER BY name - LIMIT ? - """, - (pattern, kind, limit), - ).fetchall() - else: - rows = conn.execute( - """ - SELECT name, kind, start_line, end_line - FROM symbols - WHERE name LIKE ? - ORDER BY name - LIMIT ? 
- """, - (pattern, limit), - ).fetchall() - - return [ - Symbol(name=row["name"], kind=row["kind"], range=(row["start_line"], row["end_line"])) - for row in rows - ] - - - def stats(self) -> Dict[str, Any]: - with self._lock: - conn = self._get_connection() - file_count = conn.execute("SELECT COUNT(*) AS c FROM files").fetchone()["c"] - symbol_count = conn.execute("SELECT COUNT(*) AS c FROM symbols").fetchone()["c"] - lang_rows = conn.execute( - "SELECT language, COUNT(*) AS c FROM files GROUP BY language ORDER BY c DESC" - ).fetchall() - languages = {row["language"]: row["c"] for row in lang_rows} - # Include relationship count if table exists - relationship_count = 0 - try: - rel_row = conn.execute("SELECT COUNT(*) AS c FROM code_relationships").fetchone() - relationship_count = int(rel_row["c"]) if rel_row else 0 - except sqlite3.DatabaseError: - pass - - return { - "files": int(file_count), - "symbols": int(symbol_count), - "relationships": relationship_count, - "languages": languages, - "db_path": str(self.db_path), - } - - - def _connect(self) -> sqlite3.Connection: - """Legacy method for backward compatibility.""" - return self._get_connection() - - def _create_schema(self, conn: sqlite3.Connection) -> None: - try: - conn.execute( - """ - CREATE TABLE IF NOT EXISTS files ( - id INTEGER PRIMARY KEY, - path TEXT UNIQUE NOT NULL, - language TEXT NOT NULL, - content TEXT NOT NULL, - mtime REAL, - line_count INTEGER - ) - """ - ) - conn.execute( - """ - CREATE TABLE IF NOT EXISTS symbols ( - id INTEGER PRIMARY KEY, - file_id INTEGER NOT NULL REFERENCES files(id) ON DELETE CASCADE, - name TEXT NOT NULL, - kind TEXT NOT NULL, - start_line INTEGER NOT NULL, - end_line INTEGER NOT NULL - ) - """ - ) - conn.execute("CREATE INDEX IF NOT EXISTS idx_symbols_name ON symbols(name)") - conn.execute("CREATE INDEX IF NOT EXISTS idx_symbols_kind ON symbols(kind)") - conn.execute( - """ - CREATE TABLE IF NOT EXISTS code_relationships ( - id INTEGER PRIMARY KEY, - 
source_symbol_id INTEGER NOT NULL REFERENCES symbols(id) ON DELETE CASCADE, - target_qualified_name TEXT NOT NULL, - relationship_type TEXT NOT NULL, - source_line INTEGER NOT NULL, - target_file TEXT - ) - """ - ) - conn.execute("CREATE INDEX IF NOT EXISTS idx_rel_target ON code_relationships(target_qualified_name)") - conn.execute("CREATE INDEX IF NOT EXISTS idx_rel_source ON code_relationships(source_symbol_id)") - # Chunks table for multi-vector storage (cascade retrieval architecture) - # - embedding: Original embedding for backward compatibility - # - embedding_binary: 256-dim binary vector for coarse ranking - # - embedding_dense: 2048-dim dense vector for fine ranking - conn.execute( - """ - CREATE TABLE IF NOT EXISTS chunks ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - file_path TEXT NOT NULL, - content TEXT NOT NULL, - embedding BLOB, - embedding_binary BLOB, - embedding_dense BLOB, - metadata TEXT, - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP - ) - """ - ) - conn.execute("CREATE INDEX IF NOT EXISTS idx_chunks_file_path ON chunks(file_path)") - # Run migration for existing databases - self._migrate_chunks_table(conn) - conn.commit() - except sqlite3.DatabaseError as exc: - raise StorageError(f"Failed to initialize database schema: {exc}") from exc - - def _ensure_fts_external_content(self, conn: sqlite3.Connection) -> None: - """Ensure files_fts is an FTS5 external-content table (no content duplication).""" - try: - sql_row = conn.execute( - "SELECT sql FROM sqlite_master WHERE type='table' AND name='files_fts'" - ).fetchone() - sql = str(sql_row["sql"]) if sql_row and sql_row["sql"] else None - - if sql is None: - self._create_external_fts(conn) - conn.commit() - return - - if ( - "content='files'" in sql - or 'content="files"' in sql - or "content=files" in sql - ): - self._create_fts_triggers(conn) - conn.commit() - return - - self._migrate_fts_to_external(conn) - except sqlite3.DatabaseError as exc: - raise StorageError(f"Failed to ensure FTS 
schema: {exc}") from exc - - def _create_external_fts(self, conn: sqlite3.Connection) -> None: - conn.execute( - """ - CREATE VIRTUAL TABLE files_fts USING fts5( - path UNINDEXED, - language UNINDEXED, - content, - content='files', - content_rowid='id', - tokenize="unicode61 tokenchars '_'" - ) - """ - ) - self._create_fts_triggers(conn) - - def _create_fts_triggers(self, conn: sqlite3.Connection) -> None: - conn.execute( - """ - CREATE TRIGGER IF NOT EXISTS files_ai AFTER INSERT ON files BEGIN - INSERT INTO files_fts(rowid, path, language, content) - VALUES(new.id, new.path, new.language, new.content); - END - """ - ) - conn.execute( - """ - CREATE TRIGGER IF NOT EXISTS files_ad AFTER DELETE ON files BEGIN - INSERT INTO files_fts(files_fts, rowid, path, language, content) - VALUES('delete', old.id, old.path, old.language, old.content); - END - """ - ) - conn.execute( - """ - CREATE TRIGGER IF NOT EXISTS files_au AFTER UPDATE ON files BEGIN - INSERT INTO files_fts(files_fts, rowid, path, language, content) - VALUES('delete', old.id, old.path, old.language, old.content); - INSERT INTO files_fts(rowid, path, language, content) - VALUES(new.id, new.path, new.language, new.content); - END - """ - ) - - def _migrate_fts_to_external(self, conn: sqlite3.Connection) -> None: - """Migrate legacy files_fts (with duplicated content) to external content.""" - try: - conn.execute("BEGIN") - conn.execute("DROP TRIGGER IF EXISTS files_ai") - conn.execute("DROP TRIGGER IF EXISTS files_ad") - conn.execute("DROP TRIGGER IF EXISTS files_au") - - conn.execute("ALTER TABLE files_fts RENAME TO files_fts_legacy") - self._create_external_fts(conn) - conn.execute("INSERT INTO files_fts(files_fts) VALUES('rebuild')") - conn.execute("DROP TABLE files_fts_legacy") - conn.commit() - except sqlite3.DatabaseError as exc: - try: - conn.rollback() - except Exception as rollback_exc: - logger.error( - "Rollback failed during FTS schema migration (%s): %s", exc, rollback_exc - ) - raise 
exc.with_traceback(exc.__traceback__) from rollback_exc - - try: - conn.execute("DROP TABLE IF EXISTS files_fts") - except Exception: - pass - - try: - conn.execute("ALTER TABLE files_fts_legacy RENAME TO files_fts") - conn.commit() - except Exception: - pass - raise - - try: - conn.execute("VACUUM") - except sqlite3.DatabaseError: - pass - - def _migrate_chunks_table(self, conn: sqlite3.Connection) -> None: - """Migrate existing chunks table to add multi-vector columns if needed. - - This handles upgrading existing databases that may have the chunks table - without the embedding_binary and embedding_dense columns. - """ - # Check if chunks table exists - table_exists = conn.execute( - "SELECT name FROM sqlite_master WHERE type='table' AND name='chunks'" - ).fetchone() - - if not table_exists: - # Table doesn't exist yet, nothing to migrate - return - - # Check existing columns - cursor = conn.execute("PRAGMA table_info(chunks)") - columns = {row[1] for row in cursor.fetchall()} - - # Add embedding_binary column if missing - if "embedding_binary" not in columns: - logger.info("Migrating chunks table: adding embedding_binary column") - conn.execute( - "ALTER TABLE chunks ADD COLUMN embedding_binary BLOB" - ) - - # Add embedding_dense column if missing - if "embedding_dense" not in columns: - logger.info("Migrating chunks table: adding embedding_dense column") - conn.execute( - "ALTER TABLE chunks ADD COLUMN embedding_dense BLOB" - ) - - def add_chunks( - self, - file_path: str, - chunks_data: List[Dict[str, Any]], - *, - embedding: Optional[List[List[float]]] = None, - embedding_binary: Optional[List[bytes]] = None, - embedding_dense: Optional[List[bytes]] = None, - ) -> List[int]: - """Add multiple chunks with multi-vector embeddings support. 
- - This method supports the cascade retrieval architecture with three embedding types: - - embedding: Original dense embedding for backward compatibility - - embedding_binary: 256-dim binary vector for fast coarse ranking - - embedding_dense: 2048-dim dense vector for precise fine ranking - - Args: - file_path: Path to the source file for all chunks. - chunks_data: List of dicts with 'content' and optional 'metadata' keys. - embedding: Optional list of dense embeddings (one per chunk). - embedding_binary: Optional list of binary embeddings as bytes (one per chunk). - embedding_dense: Optional list of dense embeddings as bytes (one per chunk). - - Returns: - List of inserted chunk IDs. - - Raises: - ValueError: If embedding list lengths don't match chunks_data length. - StorageError: If database operation fails. - """ - if not chunks_data: - return [] - - n_chunks = len(chunks_data) - - # Validate embedding lengths - if embedding is not None and len(embedding) != n_chunks: - raise ValueError( - f"embedding length ({len(embedding)}) != chunks_data length ({n_chunks})" - ) - if embedding_binary is not None and len(embedding_binary) != n_chunks: - raise ValueError( - f"embedding_binary length ({len(embedding_binary)}) != chunks_data length ({n_chunks})" - ) - if embedding_dense is not None and len(embedding_dense) != n_chunks: - raise ValueError( - f"embedding_dense length ({len(embedding_dense)}) != chunks_data length ({n_chunks})" - ) - - # Prepare batch data - batch_data = [] - for i, chunk in enumerate(chunks_data): - content = chunk.get("content", "") - metadata = chunk.get("metadata") - metadata_json = json.dumps(metadata) if metadata else None - - # Convert embeddings to bytes if needed - emb_blob = None - if embedding is not None: - import struct - emb_blob = struct.pack(f"{len(embedding[i])}f", *embedding[i]) - - emb_binary_blob = embedding_binary[i] if embedding_binary is not None else None - emb_dense_blob = embedding_dense[i] if embedding_dense is not None 
else None - - batch_data.append(( - file_path, content, emb_blob, emb_binary_blob, emb_dense_blob, metadata_json - )) - - with self._lock: - conn = self._get_connection() - try: - # Get starting ID before insert - row = conn.execute("SELECT MAX(id) FROM chunks").fetchone() - start_id = (row[0] or 0) + 1 - - conn.executemany( - """ - INSERT INTO chunks ( - file_path, content, embedding, embedding_binary, - embedding_dense, metadata - ) - VALUES (?, ?, ?, ?, ?, ?) - """, - batch_data - ) - conn.commit() - - # Calculate inserted IDs - return list(range(start_id, start_id + n_chunks)) - - except sqlite3.DatabaseError as exc: - raise StorageError( - f"Failed to add chunks: {exc}", - db_path=str(self.db_path), - operation="add_chunks", - ) from exc - - def get_binary_embeddings( - self, chunk_ids: List[int] - ) -> Dict[int, Optional[bytes]]: - """Get binary embeddings for specified chunk IDs. - - Used for coarse ranking in cascade retrieval architecture. - Binary embeddings (256-dim) enable fast approximate similarity search. - - Args: - chunk_ids: List of chunk IDs to retrieve embeddings for. - - Returns: - Dictionary mapping chunk_id to embedding_binary bytes (or None if not set). - - Raises: - StorageError: If database query fails. - """ - if not chunk_ids: - return {} - - with self._lock: - conn = self._get_connection() - try: - placeholders = ",".join("?" * len(chunk_ids)) - rows = conn.execute( - f"SELECT id, embedding_binary FROM chunks WHERE id IN ({placeholders})", - chunk_ids - ).fetchall() - - return {row["id"]: row["embedding_binary"] for row in rows} - - except sqlite3.DatabaseError as exc: - raise StorageError( - f"Failed to get binary embeddings: {exc}", - db_path=str(self.db_path), - operation="get_binary_embeddings", - ) from exc - - def get_dense_embeddings( - self, chunk_ids: List[int] - ) -> Dict[int, Optional[bytes]]: - """Get dense embeddings for specified chunk IDs. - - Used for fine ranking in cascade retrieval architecture. 
- Dense embeddings (2048-dim) provide high-precision similarity scoring. - - Args: - chunk_ids: List of chunk IDs to retrieve embeddings for. - - Returns: - Dictionary mapping chunk_id to embedding_dense bytes (or None if not set). - - Raises: - StorageError: If database query fails. - """ - if not chunk_ids: - return {} - - with self._lock: - conn = self._get_connection() - try: - placeholders = ",".join("?" * len(chunk_ids)) - rows = conn.execute( - f"SELECT id, embedding_dense FROM chunks WHERE id IN ({placeholders})", - chunk_ids - ).fetchall() - - return {row["id"]: row["embedding_dense"] for row in rows} - - except sqlite3.DatabaseError as exc: - raise StorageError( - f"Failed to get dense embeddings: {exc}", - db_path=str(self.db_path), - operation="get_dense_embeddings", - ) from exc - - def get_chunks_by_ids( - self, chunk_ids: List[int] - ) -> List[Dict[str, Any]]: - """Get chunk data for specified IDs. - - Args: - chunk_ids: List of chunk IDs to retrieve. - - Returns: - List of chunk dictionaries with id, file_path, content, metadata. - - Raises: - StorageError: If database query fails. - """ - if not chunk_ids: - return [] - - with self._lock: - conn = self._get_connection() - try: - placeholders = ",".join("?" 
* len(chunk_ids)) - rows = conn.execute( - f""" - SELECT id, file_path, content, metadata, created_at - FROM chunks - WHERE id IN ({placeholders}) - """, - chunk_ids - ).fetchall() - - results = [] - for row in rows: - metadata = None - if row["metadata"]: - try: - metadata = json.loads(row["metadata"]) - except json.JSONDecodeError: - pass - - results.append({ - "id": row["id"], - "file_path": row["file_path"], - "content": row["content"], - "metadata": metadata, - "created_at": row["created_at"], - }) - - return results - - except sqlite3.DatabaseError as exc: - raise StorageError( - f"Failed to get chunks: {exc}", - db_path=str(self.db_path), - operation="get_chunks_by_ids", - ) from exc - - def delete_chunks_by_file(self, file_path: str) -> int: - """Delete all chunks for a given file path. - - Args: - file_path: Path to the source file. - - Returns: - Number of deleted chunks. - - Raises: - StorageError: If database operation fails. - """ - with self._lock: - conn = self._get_connection() - try: - cursor = conn.execute( - "DELETE FROM chunks WHERE file_path = ?", - (file_path,) - ) - conn.commit() - return cursor.rowcount - - except sqlite3.DatabaseError as exc: - raise StorageError( - f"Failed to delete chunks: {exc}", - db_path=str(self.db_path), - operation="delete_chunks_by_file", - ) from exc - - def count_chunks(self) -> int: - """Count total chunks in store. - - Returns: - Total number of chunks. 
- """ - with self._lock: - conn = self._get_connection() - row = conn.execute("SELECT COUNT(*) AS c FROM chunks").fetchone() - return int(row["c"]) if row else 0 diff --git a/codex-lens/build/lib/codexlens/storage/sqlite_utils.py b/codex-lens/build/lib/codexlens/storage/sqlite_utils.py deleted file mode 100644 index 2d5730f9..00000000 --- a/codex-lens/build/lib/codexlens/storage/sqlite_utils.py +++ /dev/null @@ -1,64 +0,0 @@ -"""SQLite utility functions for CodexLens storage layer.""" - -from __future__ import annotations - -import logging -import sqlite3 - -log = logging.getLogger(__name__) - - -def check_trigram_support(conn: sqlite3.Connection) -> bool: - """Check if SQLite supports trigram tokenizer for FTS5. - - Trigram tokenizer requires SQLite >= 3.34.0. - - Args: - conn: Database connection to test - - Returns: - True if trigram tokenizer is available, False otherwise - """ - try: - # Test by creating a temporary virtual table with trigram tokenizer - conn.execute( - """ - CREATE VIRTUAL TABLE IF NOT EXISTS test_trigram_check - USING fts5(test_content, tokenize='trigram') - """ - ) - # Clean up test table - conn.execute("DROP TABLE IF EXISTS test_trigram_check") - conn.commit() - return True - except sqlite3.OperationalError as e: - # Trigram tokenizer not available - if "unrecognized tokenizer" in str(e).lower(): - log.debug("Trigram tokenizer not available in this SQLite version") - return False - # Other operational errors should be re-raised - raise - except Exception: - # Any other exception means trigram is not supported - return False - - -def get_sqlite_version(conn: sqlite3.Connection) -> tuple[int, int, int]: - """Get SQLite version as (major, minor, patch) tuple. 
- - Args: - conn: Database connection - - Returns: - Version tuple, e.g., (3, 34, 1) - """ - row = conn.execute("SELECT sqlite_version()").fetchone() - version_str = row[0] if row else "0.0.0" - parts = version_str.split('.') - try: - major = int(parts[0]) if len(parts) > 0 else 0 - minor = int(parts[1]) if len(parts) > 1 else 0 - patch = int(parts[2]) if len(parts) > 2 else 0 - return (major, minor, patch) - except (ValueError, IndexError): - return (0, 0, 0) diff --git a/codex-lens/build/lib/codexlens/storage/vector_meta_store.py b/codex-lens/build/lib/codexlens/storage/vector_meta_store.py deleted file mode 100644 index bd466a60..00000000 --- a/codex-lens/build/lib/codexlens/storage/vector_meta_store.py +++ /dev/null @@ -1,415 +0,0 @@ -"""Central storage for vector metadata. - -This module provides a centralized SQLite database for storing chunk metadata -associated with centralized vector indexes. Instead of traversing all _index.db -files to fetch chunk metadata, this provides O(1) lookup by chunk ID. -""" - -from __future__ import annotations - -import json -import logging -import sqlite3 -import threading -from pathlib import Path -from typing import Any, Dict, List, Optional - -from codexlens.errors import StorageError - -logger = logging.getLogger(__name__) - - -class VectorMetadataStore: - """Store and retrieve chunk metadata for centralized vector search. - - This class provides efficient storage and retrieval of chunk metadata - for the centralized vector index architecture. All chunk metadata is - stored in a single _vectors_meta.db file at the project root, enabling - fast lookups without traversing multiple _index.db files. 
- - Schema: - chunk_metadata: - - chunk_id: INTEGER PRIMARY KEY - Global chunk ID - - file_path: TEXT NOT NULL - Path to source file - - content: TEXT - Chunk text content - - start_line: INTEGER - Start line in source file - - end_line: INTEGER - End line in source file - - category: TEXT - Content category (code/doc) - - metadata: TEXT - JSON-encoded additional metadata - - source_index_db: TEXT - Path to source _index.db file - """ - - def __init__(self, db_path: Path | str) -> None: - """Initialize VectorMetadataStore. - - Args: - db_path: Path to SQLite database file. - """ - self.db_path = Path(db_path) - self.db_path.parent.mkdir(parents=True, exist_ok=True) - - # Thread-safe connection management - self._lock = threading.RLock() - self._local = threading.local() - - def _get_connection(self) -> sqlite3.Connection: - """Get or create a thread-local database connection. - - Each thread gets its own connection to ensure thread safety. - """ - conn = getattr(self._local, "conn", None) - if conn is None: - conn = sqlite3.connect( - str(self.db_path), - timeout=30.0, - check_same_thread=True, - ) - conn.row_factory = sqlite3.Row - conn.execute("PRAGMA journal_mode=WAL") - conn.execute("PRAGMA synchronous=NORMAL") - conn.execute("PRAGMA mmap_size=1073741824") # 1GB mmap - self._local.conn = conn - return conn - - def _ensure_schema(self) -> None: - """Create tables if they don't exist.""" - with self._lock: - conn = self._get_connection() - try: - conn.execute(''' - CREATE TABLE IF NOT EXISTS chunk_metadata ( - chunk_id INTEGER PRIMARY KEY, - file_path TEXT NOT NULL, - content TEXT, - start_line INTEGER, - end_line INTEGER, - category TEXT, - metadata TEXT, - source_index_db TEXT - ) - ''') - conn.execute( - 'CREATE INDEX IF NOT EXISTS idx_chunk_file_path ' - 'ON chunk_metadata(file_path)' - ) - conn.execute( - 'CREATE INDEX IF NOT EXISTS idx_chunk_category ' - 'ON chunk_metadata(category)' - ) - # Binary vectors table for cascade search - conn.execute(''' - 
CREATE TABLE IF NOT EXISTS binary_vectors ( - chunk_id INTEGER PRIMARY KEY, - vector BLOB NOT NULL - ) - ''') - conn.commit() - logger.debug("VectorMetadataStore schema created/verified") - except sqlite3.Error as e: - raise StorageError( - f"Failed to create schema: {e}", - db_path=str(self.db_path), - operation="_ensure_schema" - ) from e - - def add_chunk( - self, - chunk_id: int, - file_path: str, - content: str, - start_line: Optional[int] = None, - end_line: Optional[int] = None, - category: Optional[str] = None, - metadata: Optional[Dict[str, Any]] = None, - source_index_db: Optional[str] = None, - ) -> None: - """Add a single chunk's metadata. - - Args: - chunk_id: Global unique chunk ID. - file_path: Path to source file. - content: Chunk text content. - start_line: Start line in source file. - end_line: End line in source file. - category: Content category (code/doc). - metadata: Additional metadata dictionary. - source_index_db: Path to source _index.db file. - """ - with self._lock: - conn = self._get_connection() - try: - metadata_json = json.dumps(metadata) if metadata else None - conn.execute( - ''' - INSERT OR REPLACE INTO chunk_metadata - (chunk_id, file_path, content, start_line, end_line, - category, metadata, source_index_db) - VALUES (?, ?, ?, ?, ?, ?, ?, ?) - ''', - (chunk_id, file_path, content, start_line, end_line, - category, metadata_json, source_index_db) - ) - conn.commit() - except sqlite3.Error as e: - raise StorageError( - f"Failed to add chunk {chunk_id}: {e}", - db_path=str(self.db_path), - operation="add_chunk" - ) from e - - def add_chunks(self, chunks: List[Dict[str, Any]]) -> None: - """Batch insert chunk metadata. 
- - Args: - chunks: List of dictionaries with keys: - - chunk_id (required): Global unique chunk ID - - file_path (required): Path to source file - - content: Chunk text content - - start_line: Start line in source file - - end_line: End line in source file - - category: Content category (code/doc) - - metadata: Additional metadata dictionary - - source_index_db: Path to source _index.db file - """ - if not chunks: - return - - with self._lock: - conn = self._get_connection() - try: - batch_data = [] - for chunk in chunks: - metadata = chunk.get("metadata") - metadata_json = json.dumps(metadata) if metadata else None - batch_data.append(( - chunk["chunk_id"], - chunk["file_path"], - chunk.get("content"), - chunk.get("start_line"), - chunk.get("end_line"), - chunk.get("category"), - metadata_json, - chunk.get("source_index_db"), - )) - - conn.executemany( - ''' - INSERT OR REPLACE INTO chunk_metadata - (chunk_id, file_path, content, start_line, end_line, - category, metadata, source_index_db) - VALUES (?, ?, ?, ?, ?, ?, ?, ?) - ''', - batch_data - ) - conn.commit() - logger.debug("Batch inserted %d chunk metadata records", len(chunks)) - except sqlite3.Error as e: - raise StorageError( - f"Failed to batch insert chunks: {e}", - db_path=str(self.db_path), - operation="add_chunks" - ) from e - - def get_chunks_by_ids( - self, - chunk_ids: List[int], - category: Optional[str] = None, - ) -> List[Dict[str, Any]]: - """Retrieve chunks by their IDs - the key optimization. - - This is the primary method that replaces traversing all _index.db files. - Provides O(1) lookup by chunk ID instead of O(n) where n is the number - of index databases. - - Args: - chunk_ids: List of chunk IDs to retrieve. - category: Optional category filter ('code' or 'doc'). 
- - Returns: - List of dictionaries with chunk metadata: - - chunk_id: Global chunk ID - - file_path: Path to source file - - content: Chunk text content - - start_line: Start line in source file - - end_line: End line in source file - - category: Content category - - metadata: Parsed metadata dictionary - - source_index_db: Source _index.db path - """ - if not chunk_ids: - return [] - - # No lock needed for reads: WAL mode + thread-local connections ensure safety - conn = self._get_connection() - try: - placeholders = ",".join("?" * len(chunk_ids)) - - if category: - query = f''' - SELECT chunk_id, file_path, content, start_line, end_line, - category, metadata, source_index_db - FROM chunk_metadata - WHERE chunk_id IN ({placeholders}) AND category = ? - ''' - params = list(chunk_ids) + [category] - else: - query = f''' - SELECT chunk_id, file_path, content, start_line, end_line, - category, metadata, source_index_db - FROM chunk_metadata - WHERE chunk_id IN ({placeholders}) - ''' - params = list(chunk_ids) - - rows = conn.execute(query, params).fetchall() - - results = [] - for row in rows: - metadata = None - if row["metadata"]: - try: - metadata = json.loads(row["metadata"]) - except json.JSONDecodeError: - metadata = {} - - results.append({ - "chunk_id": row["chunk_id"], - "file_path": row["file_path"], - "content": row["content"], - "start_line": row["start_line"], - "end_line": row["end_line"], - "category": row["category"], - "metadata": metadata or {}, - "source_index_db": row["source_index_db"], - }) - - return results - - except sqlite3.Error as e: - logger.error("Failed to get chunks by IDs: %s", e) - return [] - - def get_chunk_count(self) -> int: - """Get total number of chunks in store. - - Returns: - Total chunk count. 
- """ - # No lock needed for reads: WAL mode + thread-local connections ensure safety - conn = self._get_connection() - try: - row = conn.execute( - "SELECT COUNT(*) FROM chunk_metadata" - ).fetchone() - return row[0] if row else 0 - except sqlite3.Error: - return 0 - - def clear(self) -> None: - """Clear all metadata.""" - with self._lock: - conn = self._get_connection() - try: - conn.execute("DELETE FROM chunk_metadata") - conn.commit() - logger.info("Cleared all chunk metadata") - except sqlite3.Error as e: - raise StorageError( - f"Failed to clear metadata: {e}", - db_path=str(self.db_path), - operation="clear" - ) from e - - def close(self) -> None: - """Close database connection.""" - with self._lock: - conn = getattr(self._local, "conn", None) - if conn is not None: - conn.close() - self._local.conn = None - - def __enter__(self) -> "VectorMetadataStore": - """Context manager entry.""" - self._ensure_schema() - return self - - def __exit__(self, exc_type, exc_val, exc_tb) -> None: - """Context manager exit.""" - self.close() - - # ============= Binary Vector Methods for Cascade Search ============= - - def add_binary_vectors( - self, chunk_ids: List[int], binary_vectors: List[bytes] - ) -> None: - """Batch insert binary vectors for cascade search. - - Args: - chunk_ids: List of chunk IDs. - binary_vectors: List of packed binary vectors (as bytes). 
- """ - if not chunk_ids or len(chunk_ids) != len(binary_vectors): - return - - with self._lock: - conn = self._get_connection() - try: - data = list(zip(chunk_ids, binary_vectors)) - conn.executemany( - "INSERT OR REPLACE INTO binary_vectors (chunk_id, vector) VALUES (?, ?)", - data - ) - conn.commit() - logger.debug("Added %d binary vectors", len(chunk_ids)) - except sqlite3.Error as e: - raise StorageError( - f"Failed to add binary vectors: {e}", - db_path=str(self.db_path), - operation="add_binary_vectors" - ) from e - - def get_all_binary_vectors(self) -> List[tuple]: - """Get all binary vectors for cascade search. - - Returns: - List of (chunk_id, vector_bytes) tuples. - """ - conn = self._get_connection() - try: - rows = conn.execute( - "SELECT chunk_id, vector FROM binary_vectors" - ).fetchall() - return [(row[0], row[1]) for row in rows] - except sqlite3.Error as e: - logger.error("Failed to get binary vectors: %s", e) - return [] - - def get_binary_vector_count(self) -> int: - """Get total number of binary vectors. - - Returns: - Binary vector count. 
- """ - conn = self._get_connection() - try: - row = conn.execute( - "SELECT COUNT(*) FROM binary_vectors" - ).fetchone() - return row[0] if row else 0 - except sqlite3.Error: - return 0 - - def clear_binary_vectors(self) -> None: - """Clear all binary vectors.""" - with self._lock: - conn = self._get_connection() - try: - conn.execute("DELETE FROM binary_vectors") - conn.commit() - logger.info("Cleared all binary vectors") - except sqlite3.Error as e: - raise StorageError( - f"Failed to clear binary vectors: {e}", - db_path=str(self.db_path), - operation="clear_binary_vectors" - ) from e diff --git a/codex-lens/build/lib/codexlens/watcher/__init__.py b/codex-lens/build/lib/codexlens/watcher/__init__.py deleted file mode 100644 index 4c095ec4..00000000 --- a/codex-lens/build/lib/codexlens/watcher/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -"""File watcher module for real-time index updates.""" - -from .events import ChangeType, FileEvent, IndexResult, WatcherConfig, WatcherStats -from .file_watcher import FileWatcher -from .incremental_indexer import IncrementalIndexer -from .manager import WatcherManager - -__all__ = [ - "ChangeType", - "FileEvent", - "IndexResult", - "WatcherConfig", - "WatcherStats", - "FileWatcher", - "IncrementalIndexer", - "WatcherManager", -] diff --git a/codex-lens/build/lib/codexlens/watcher/events.py b/codex-lens/build/lib/codexlens/watcher/events.py deleted file mode 100644 index edb43787..00000000 --- a/codex-lens/build/lib/codexlens/watcher/events.py +++ /dev/null @@ -1,82 +0,0 @@ -"""Event types for file watcher.""" - -from __future__ import annotations - -import time -from dataclasses import dataclass, field -from enum import Enum -from pathlib import Path -from typing import List, Optional, Set - - -class ChangeType(Enum): - """Type of file system change.""" - CREATED = "created" - MODIFIED = "modified" - DELETED = "deleted" - MOVED = "moved" - - -@dataclass -class FileEvent: - """A file system change event.""" - path: Path - 
change_type: ChangeType - timestamp: float - old_path: Optional[Path] = None # For MOVED events - - -@dataclass -class WatcherConfig: - """Configuration for file watcher.""" - debounce_ms: int = 60000 # Default 60 seconds for debounce - ignored_patterns: Set[str] = field(default_factory=lambda: { - # Version control - ".git", ".svn", ".hg", - # Python environments & cache - ".venv", "venv", "env", "__pycache__", ".pytest_cache", ".mypy_cache", ".ruff_cache", - # Node.js - "node_modules", "bower_components", ".npm", ".yarn", - # Build artifacts - "dist", "build", "out", "target", "bin", "obj", "_build", "coverage", "htmlcov", - # IDE & Editor - ".idea", ".vscode", ".vs", ".eclipse", - # CodexLens internal - ".codexlens", - # Package manager caches - ".cache", ".parcel-cache", ".turbo", ".next", ".nuxt", - # Logs & temp - "logs", "tmp", "temp", - }) - languages: Optional[List[str]] = None # None = all supported - - -@dataclass -class PendingQueueStatus: - """Status of pending file changes queue.""" - file_count: int = 0 - files: List[str] = field(default_factory=list) # Limited to 20 files - countdown_seconds: int = 0 - last_event_time: Optional[float] = None - - -@dataclass -class IndexResult: - """Result of processing file changes.""" - files_indexed: int = 0 - files_removed: int = 0 - symbols_added: int = 0 - symbols_removed: int = 0 - files_success: List[str] = field(default_factory=list) - files_failed: List[str] = field(default_factory=list) - errors: List[str] = field(default_factory=list) - timestamp: float = field(default_factory=time.time) - - -@dataclass -class WatcherStats: - """Runtime statistics for watcher.""" - files_watched: int = 0 - events_processed: int = 0 - last_event_time: Optional[float] = None - is_running: bool = False diff --git a/codex-lens/build/lib/codexlens/watcher/file_watcher.py b/codex-lens/build/lib/codexlens/watcher/file_watcher.py deleted file mode 100644 index 4fc50691..00000000 --- 
a/codex-lens/build/lib/codexlens/watcher/file_watcher.py +++ /dev/null @@ -1,347 +0,0 @@ -"""File system watcher using watchdog library.""" - -from __future__ import annotations - -import logging -import threading -import time -from pathlib import Path -from typing import Callable, Dict, List, Optional - -from watchdog.observers import Observer -from watchdog.events import FileSystemEventHandler - -from .events import ChangeType, FileEvent, WatcherConfig, PendingQueueStatus -from ..config import Config - -logger = logging.getLogger(__name__) - -# Maximum queue size to prevent unbounded memory growth -# When exceeded, forces immediate flush to avoid memory exhaustion -MAX_QUEUE_SIZE = 50000 - - -class _CodexLensHandler(FileSystemEventHandler): - """Internal handler for watchdog events.""" - - def __init__( - self, - watcher: "FileWatcher", - on_event: Callable[[FileEvent], None], - ) -> None: - super().__init__() - self._watcher = watcher - self._on_event = on_event - - def on_created(self, event) -> None: - if event.is_directory: - return - self._emit(event.src_path, ChangeType.CREATED) - - def on_modified(self, event) -> None: - if event.is_directory: - return - self._emit(event.src_path, ChangeType.MODIFIED) - - def on_deleted(self, event) -> None: - if event.is_directory: - return - self._emit(event.src_path, ChangeType.DELETED) - - def on_moved(self, event) -> None: - if event.is_directory: - return - self._emit(event.dest_path, ChangeType.MOVED, old_path=event.src_path) - - def _emit( - self, - path: str, - change_type: ChangeType, - old_path: Optional[str] = None, - ) -> None: - path_obj = Path(path) - - # Filter out files that should not be indexed - if not self._watcher._should_index_file(path_obj): - return - - event = FileEvent( - path=path_obj, - change_type=change_type, - timestamp=time.time(), - old_path=Path(old_path) if old_path else None, - ) - self._on_event(event) - - -class FileWatcher: - """File system watcher for monitoring directory changes. 
- - Uses watchdog library for cross-platform file system monitoring. - Events are forwarded to the on_changes callback. - - Example: - def handle_changes(events: List[FileEvent]) -> None: - for event in events: - print(f"{event.change_type}: {event.path}") - - watcher = FileWatcher(Path("."), WatcherConfig(), handle_changes) - watcher.start() - watcher.wait() # Block until stopped - """ - - def __init__( - self, - root_path: Path, - config: WatcherConfig, - on_changes: Callable[[List[FileEvent]], None], - ) -> None: - """Initialize file watcher. - - Args: - root_path: Directory to watch recursively - config: Watcher configuration - on_changes: Callback invoked with batched events - """ - self.root_path = Path(root_path).resolve() - self.config = config - self.on_changes = on_changes - - self._observer: Optional[Observer] = None - self._running = False - self._stop_event = threading.Event() - self._lock = threading.RLock() - - # Event queue for batching - self._event_queue: List[FileEvent] = [] - self._queue_lock = threading.Lock() - - # Debounce timer (true debounce - waits after last event) - self._flush_timer: Optional[threading.Timer] = None - self._last_event_time: float = 0 - - # Queue change callbacks for real-time UI updates - self._queue_change_callbacks: List[Callable[[PendingQueueStatus], None]] = [] - - # Config instance for language checking - self._codexlens_config = Config() - - def _should_index_file(self, path: Path) -> bool: - """Check if file should be indexed based on extension and ignore patterns. 
- - Args: - path: File path to check - - Returns: - True if file should be indexed, False otherwise - """ - # Check against ignore patterns - parts = path.parts - for pattern in self.config.ignored_patterns: - if pattern in parts: - return False - - # Check extension against supported languages - language = self._codexlens_config.language_for_path(path) - return language is not None - - def _on_raw_event(self, event: FileEvent) -> None: - """Handle raw event from watchdog handler with true debounce.""" - force_flush = False - - with self._queue_lock: - # Check queue size limit to prevent memory exhaustion - if len(self._event_queue) >= MAX_QUEUE_SIZE: - logger.warning( - "Event queue limit (%d) reached, forcing immediate flush", - MAX_QUEUE_SIZE - ) - if self._flush_timer: - self._flush_timer.cancel() - self._flush_timer = None - force_flush = True - - self._event_queue.append(event) - self._last_event_time = time.time() - - # Cancel previous timer and schedule new one (true debounce) - # Skip if we're about to force flush - if not force_flush: - if self._flush_timer: - self._flush_timer.cancel() - - self._flush_timer = threading.Timer( - self.config.debounce_ms / 1000.0, - self._flush_events - ) - self._flush_timer.daemon = True - self._flush_timer.start() - - # Force flush outside lock to avoid deadlock - if force_flush: - self._flush_events() - - # Notify queue change (outside lock to avoid deadlock) - self._notify_queue_change() - - def _debounce_loop(self) -> None: - """Background thread for checking flush signal file.""" - signal_file = self.root_path / '.codexlens' / 'flush.signal' - while self._running: - time.sleep(1.0) # Check every second - # Check for flush signal file - if signal_file.exists(): - try: - signal_file.unlink() - logger.info("Flush signal detected, triggering immediate index") - self.flush_now() - except Exception as e: - logger.warning("Failed to handle flush signal: %s", e) - - def _flush_events(self) -> None: - """Flush queued events 
with deduplication.""" - with self._queue_lock: - if not self._event_queue: - return - - # Deduplicate: keep latest event per path - deduped: Dict[Path, FileEvent] = {} - for event in self._event_queue: - deduped[event.path] = event - - events = list(deduped.values()) - self._event_queue.clear() - self._last_event_time = 0 # Reset after flush - - # Notify queue cleared - self._notify_queue_change() - - if events: - try: - self.on_changes(events) - except Exception as exc: - logger.error("Error in on_changes callback: %s", exc) - - def flush_now(self) -> None: - """Immediately flush pending queue (manual trigger).""" - with self._queue_lock: - if self._flush_timer: - self._flush_timer.cancel() - self._flush_timer = None - self._flush_events() - - def get_pending_queue_status(self) -> PendingQueueStatus: - """Get current pending queue status for UI display.""" - with self._queue_lock: - file_count = len(self._event_queue) - files = [str(e.path.name) for e in self._event_queue[:20]] - - # Calculate countdown - if self._last_event_time > 0 and file_count > 0: - elapsed = time.time() - self._last_event_time - remaining = max(0, self.config.debounce_ms / 1000.0 - elapsed) - countdown = int(remaining) - else: - countdown = 0 - - return PendingQueueStatus( - file_count=file_count, - files=files, - countdown_seconds=countdown, - last_event_time=self._last_event_time if file_count > 0 else None - ) - - def register_queue_change_callback( - self, callback: Callable[[PendingQueueStatus], None] - ) -> None: - """Register callback for queue change notifications.""" - self._queue_change_callbacks.append(callback) - - def _notify_queue_change(self) -> None: - """Notify all registered callbacks of queue change.""" - status = self.get_pending_queue_status() - for callback in self._queue_change_callbacks: - try: - callback(status) - except Exception as e: - logger.error("Queue change callback error: %s", e) - - def start(self) -> None: - """Start watching the directory. 
- - Non-blocking. Use wait() to block until stopped. - """ - with self._lock: - if self._running: - logger.warning("Watcher already running") - return - - if not self.root_path.exists(): - raise ValueError(f"Root path does not exist: {self.root_path}") - - self._observer = Observer() - handler = _CodexLensHandler(self, self._on_raw_event) - self._observer.schedule(handler, str(self.root_path), recursive=True) - - self._running = True - self._stop_event.clear() - self._observer.start() - - # Start signal check thread (for flush.signal file) - self._signal_check_thread = threading.Thread( - target=self._debounce_loop, - daemon=True, - name="FileWatcher-SignalCheck", - ) - self._signal_check_thread.start() - - logger.info("Started watching: %s", self.root_path) - - def stop(self) -> None: - """Stop watching the directory. - - Gracefully stops the observer and flushes remaining events. - """ - with self._lock: - if not self._running: - return - - self._running = False - self._stop_event.set() - - # Cancel pending flush timer - if self._flush_timer: - self._flush_timer.cancel() - self._flush_timer = None - - if self._observer: - self._observer.stop() - self._observer.join(timeout=5.0) - self._observer = None - - # Wait for signal check thread to finish - if hasattr(self, '_signal_check_thread') and self._signal_check_thread and self._signal_check_thread.is_alive(): - self._signal_check_thread.join(timeout=2.0) - self._signal_check_thread = None - - # Flush any remaining events - self._flush_events() - - logger.info("Stopped watching: %s", self.root_path) - - def wait(self) -> None: - """Block until watcher is stopped. - - Use Ctrl+C or call stop() from another thread to unblock. 
- """ - try: - while self._running: - self._stop_event.wait(timeout=1.0) - except KeyboardInterrupt: - logger.info("Received interrupt, stopping watcher...") - self.stop() - - @property - def is_running(self) -> bool: - """Check if watcher is currently running.""" - return self._running diff --git a/codex-lens/build/lib/codexlens/watcher/incremental_indexer.py b/codex-lens/build/lib/codexlens/watcher/incremental_indexer.py deleted file mode 100644 index 9991c5fc..00000000 --- a/codex-lens/build/lib/codexlens/watcher/incremental_indexer.py +++ /dev/null @@ -1,369 +0,0 @@ -"""Incremental indexer for processing file changes.""" - -from __future__ import annotations - -import logging -from dataclasses import dataclass -from pathlib import Path -from typing import List, Optional - -from codexlens.config import Config -from codexlens.parsers.factory import ParserFactory -from codexlens.storage.dir_index import DirIndexStore -from codexlens.storage.global_index import GlobalSymbolIndex -from codexlens.storage.path_mapper import PathMapper -from codexlens.storage.registry import RegistryStore - -from .events import ChangeType, FileEvent, IndexResult - -logger = logging.getLogger(__name__) - - -@dataclass -class FileIndexResult: - """Result of indexing a single file.""" - path: Path - symbols_count: int - success: bool - error: Optional[str] = None - - -class IncrementalIndexer: - """Incremental indexer for processing file change events. - - Processes file events (create, modify, delete, move) and updates - the corresponding index databases incrementally. 
- - Reuses existing infrastructure: - - ParserFactory for symbol extraction - - DirIndexStore for per-directory storage - - GlobalSymbolIndex for cross-file symbols - - PathMapper for source-to-index path conversion - - Example: - indexer = IncrementalIndexer(registry, mapper, config) - result = indexer.process_changes([ - FileEvent(Path("foo.py"), ChangeType.MODIFIED, time.time()), - ]) - print(f"Indexed {result.files_indexed} files") - """ - - def __init__( - self, - registry: RegistryStore, - mapper: PathMapper, - config: Optional[Config] = None, - ) -> None: - """Initialize incremental indexer. - - Args: - registry: Global project registry - mapper: Path mapper for source-to-index conversion - config: CodexLens configuration (uses defaults if None) - """ - self.registry = registry - self.mapper = mapper - self.config = config or Config() - self.parser_factory = ParserFactory(self.config) - - self._global_index: Optional[GlobalSymbolIndex] = None - self._dir_stores: dict[Path, DirIndexStore] = {} - self._lock = __import__("threading").RLock() - - def _get_global_index(self, index_root: Path, source_root: Optional[Path] = None) -> Optional[GlobalSymbolIndex]: - """Get or create global symbol index. 
- - Args: - index_root: Root directory containing the global symbol index DB - source_root: Source directory root for looking up project_id from registry - """ - if not self.config.global_symbol_index_enabled: - return None - - if self._global_index is None: - global_db_path = index_root / GlobalSymbolIndex.DEFAULT_DB_NAME - if global_db_path.exists(): - # Get project_id from registry using source_root - project_id = 0 # Default fallback - if source_root: - project_info = self.registry.get_project(source_root) - if project_info: - project_id = project_info.id - self._global_index = GlobalSymbolIndex(global_db_path, project_id=project_id) - - return self._global_index - - def _get_dir_store(self, dir_path: Path) -> Optional[DirIndexStore]: - """Get DirIndexStore for a directory, if indexed.""" - with self._lock: - if dir_path in self._dir_stores: - return self._dir_stores[dir_path] - - index_db = self.mapper.source_to_index_db(dir_path) - if not index_db.exists(): - logger.debug("No index found for directory: %s", dir_path) - return None - - # Get index root for global index - source_root = self.mapper.get_project_root(dir_path) or dir_path - index_root = self.mapper.source_to_index_dir(source_root) - global_index = self._get_global_index(index_root, source_root=source_root) - - store = DirIndexStore( - index_db, - config=self.config, - global_index=global_index, - ) - self._dir_stores[dir_path] = store - return store - - def process_changes(self, events: List[FileEvent]) -> IndexResult: - """Process a batch of file change events. 
- - Args: - events: List of file events to process - - Returns: - IndexResult with statistics - """ - result = IndexResult() - - for event in events: - try: - if event.change_type == ChangeType.CREATED: - file_result = self._index_file(event.path) - if file_result.success: - result.files_indexed += 1 - result.symbols_added += file_result.symbols_count - else: - result.errors.append(file_result.error or f"Failed to index: {event.path}") - - elif event.change_type == ChangeType.MODIFIED: - file_result = self._index_file(event.path) - if file_result.success: - result.files_indexed += 1 - result.symbols_added += file_result.symbols_count - else: - result.errors.append(file_result.error or f"Failed to index: {event.path}") - - elif event.change_type == ChangeType.DELETED: - self._remove_file(event.path) - result.files_removed += 1 - - elif event.change_type == ChangeType.MOVED: - # Remove from old location, add at new location - if event.old_path: - self._remove_file(event.old_path) - result.files_removed += 1 - file_result = self._index_file(event.path) - if file_result.success: - result.files_indexed += 1 - result.symbols_added += file_result.symbols_count - else: - result.errors.append(file_result.error or f"Failed to index: {event.path}") - - except Exception as exc: - error_msg = f"Error processing {event.path}: {type(exc).__name__}: {exc}" - logger.error(error_msg) - result.errors.append(error_msg) - - return result - - def _index_file(self, path: Path) -> FileIndexResult: - """Index a single file. 
- - Args: - path: Path to the file to index - - Returns: - FileIndexResult with status - """ - path = Path(path).resolve() - - # Check if file exists - if not path.exists(): - return FileIndexResult( - path=path, - symbols_count=0, - success=False, - error=f"File not found: {path}", - ) - - # Check if language is supported - language = self.config.language_for_path(path) - if not language: - return FileIndexResult( - path=path, - symbols_count=0, - success=False, - error=f"Unsupported language for: {path}", - ) - - # Get directory store - dir_path = path.parent - store = self._get_dir_store(dir_path) - if store is None: - return FileIndexResult( - path=path, - symbols_count=0, - success=False, - error=f"Directory not indexed: {dir_path}", - ) - - # Read file content with fallback encodings - try: - content = path.read_text(encoding="utf-8") - except UnicodeDecodeError: - logger.debug("UTF-8 decode failed for %s, using fallback with errors='ignore'", path) - try: - content = path.read_text(encoding="utf-8", errors="ignore") - except Exception as exc: - return FileIndexResult( - path=path, - symbols_count=0, - success=False, - error=f"Failed to read file: {exc}", - ) - except Exception as exc: - return FileIndexResult( - path=path, - symbols_count=0, - success=False, - error=f"Failed to read file: {exc}", - ) - - # Parse symbols - try: - parser = self.parser_factory.get_parser(language) - indexed_file = parser.parse(content, path) - except Exception as exc: - error_msg = f"Failed to parse {path}: {type(exc).__name__}: {exc}" - logger.error(error_msg) - return FileIndexResult( - path=path, - symbols_count=0, - success=False, - error=error_msg, - ) - - # Update store with retry logic for transient database errors - max_retries = 3 - for attempt in range(max_retries): - try: - store.add_file( - name=path.name, - full_path=str(path), - content=content, - language=language, - symbols=indexed_file.symbols, - relationships=indexed_file.relationships, - ) - - # Update merkle 
root - store.update_merkle_root() - - logger.debug("Indexed file: %s (%d symbols)", path, len(indexed_file.symbols)) - - return FileIndexResult( - path=path, - symbols_count=len(indexed_file.symbols), - success=True, - ) - - except __import__("sqlite3").OperationalError as exc: - # Transient database errors (e.g., database locked) - if attempt < max_retries - 1: - import time - wait_time = 0.1 * (2 ** attempt) # Exponential backoff - logger.debug("Database operation failed (attempt %d/%d), retrying in %.2fs: %s", - attempt + 1, max_retries, wait_time, exc) - time.sleep(wait_time) - continue - else: - error_msg = f"Failed to store {path} after {max_retries} attempts: {exc}" - logger.error(error_msg) - return FileIndexResult( - path=path, - symbols_count=0, - success=False, - error=error_msg, - ) - except Exception as exc: - error_msg = f"Failed to store {path}: {type(exc).__name__}: {exc}" - logger.error(error_msg) - return FileIndexResult( - path=path, - symbols_count=0, - success=False, - error=error_msg, - ) - - # Should never reach here - return FileIndexResult( - path=path, - symbols_count=0, - success=False, - error="Unexpected error in indexing loop", - ) - - def _remove_file(self, path: Path) -> bool: - """Remove a file from the index. 
- - Args: - path: Path to the file to remove - - Returns: - True if removed successfully - """ - path = Path(path).resolve() - dir_path = path.parent - - store = self._get_dir_store(dir_path) - if store is None: - logger.debug("Cannot remove file, directory not indexed: %s", dir_path) - return False - - # Retry logic for transient database errors - max_retries = 3 - for attempt in range(max_retries): - try: - store.remove_file(str(path)) - store.update_merkle_root() - logger.debug("Removed file from index: %s", path) - return True - - except __import__("sqlite3").OperationalError as exc: - # Transient database errors (e.g., database locked) - if attempt < max_retries - 1: - import time - wait_time = 0.1 * (2 ** attempt) # Exponential backoff - logger.debug("Database operation failed (attempt %d/%d), retrying in %.2fs: %s", - attempt + 1, max_retries, wait_time, exc) - time.sleep(wait_time) - continue - else: - logger.error("Failed to remove %s after %d attempts: %s", path, max_retries, exc) - return False - except Exception as exc: - logger.error("Failed to remove %s: %s", path, exc) - return False - - # Should never reach here - return False - - def close(self) -> None: - """Close all open stores.""" - with self._lock: - for store in self._dir_stores.values(): - try: - store.close() - except Exception: - pass - self._dir_stores.clear() - - if self._global_index: - try: - self._global_index.close() - except Exception: - pass - self._global_index = None diff --git a/codex-lens/build/lib/codexlens/watcher/manager.py b/codex-lens/build/lib/codexlens/watcher/manager.py deleted file mode 100644 index 5a5653d4..00000000 --- a/codex-lens/build/lib/codexlens/watcher/manager.py +++ /dev/null @@ -1,255 +0,0 @@ -"""Watcher manager for coordinating file watching and incremental indexing.""" - -from __future__ import annotations - -import json -import logging -import signal -import threading -import time -from pathlib import Path -from typing import Callable, List, Optional - 
-from codexlens.config import Config -from codexlens.storage.path_mapper import PathMapper -from codexlens.storage.registry import RegistryStore - -from .events import FileEvent, IndexResult, PendingQueueStatus, WatcherConfig, WatcherStats -from .file_watcher import FileWatcher -from .incremental_indexer import IncrementalIndexer - -logger = logging.getLogger(__name__) - - -class WatcherManager: - """High-level manager for file watching and incremental indexing. - - Coordinates FileWatcher and IncrementalIndexer with: - - Lifecycle management (start/stop) - - Signal handling (SIGINT/SIGTERM) - - Statistics tracking - - Graceful shutdown - """ - - def __init__( - self, - root_path: Path, - config: Optional[Config] = None, - watcher_config: Optional[WatcherConfig] = None, - on_indexed: Optional[Callable[[IndexResult], None]] = None, - on_queue_change: Optional[Callable[[PendingQueueStatus], None]] = None, - ) -> None: - self.root_path = Path(root_path).resolve() - self.config = config or Config() - self.watcher_config = watcher_config or WatcherConfig() - self.on_indexed = on_indexed - self.on_queue_change = on_queue_change - - self._registry: Optional[RegistryStore] = None - self._mapper: Optional[PathMapper] = None - self._watcher: Optional[FileWatcher] = None - self._indexer: Optional[IncrementalIndexer] = None - - self._running = False - self._stop_event = threading.Event() - self._lock = threading.RLock() - - # Statistics - self._stats = WatcherStats() - self._original_sigint = None - self._original_sigterm = None - - # Index history for tracking recent results - self._index_history: List[IndexResult] = [] - self._max_history_size = 10 - - def _handle_changes(self, events: List[FileEvent]) -> None: - """Handle file change events from watcher.""" - if not self._indexer or not events: - return - - logger.info("Processing %d file changes", len(events)) - result = self._indexer.process_changes(events) - - # Update stats - self._stats.events_processed += len(events) 
- self._stats.last_event_time = time.time() - - # Save to history - self._index_history.append(result) - if len(self._index_history) > self._max_history_size: - self._index_history.pop(0) - - if result.files_indexed > 0 or result.files_removed > 0: - logger.info( - "Indexed %d files, removed %d files, %d errors", - result.files_indexed, result.files_removed, len(result.errors) - ) - - # Output JSON for TypeScript backend parsing - result_data = { - "files_indexed": result.files_indexed, - "files_removed": result.files_removed, - "symbols_added": result.symbols_added, - "symbols_removed": result.symbols_removed, - "files_success": result.files_success[:20], # Limit output - "files_failed": result.files_failed[:20], - "errors": result.errors[:10], - "timestamp": result.timestamp - } - print(f"[INDEX_RESULT] {json.dumps(result_data)}", flush=True) - - if self.on_indexed: - try: - self.on_indexed(result) - except Exception as exc: - logger.error("Error in on_indexed callback: %s", exc) - - def _signal_handler(self, signum, frame) -> None: - """Handle shutdown signals.""" - logger.info("Received signal %d, stopping...", signum) - self.stop() - - def _install_signal_handlers(self) -> None: - """Install signal handlers for graceful shutdown.""" - try: - self._original_sigint = signal.signal(signal.SIGINT, self._signal_handler) - if hasattr(signal, 'SIGTERM'): - self._original_sigterm = signal.signal(signal.SIGTERM, self._signal_handler) - except (ValueError, OSError): - # Signal handling not available (e.g., not main thread) - pass - - def _restore_signal_handlers(self) -> None: - """Restore original signal handlers.""" - try: - if self._original_sigint is not None: - signal.signal(signal.SIGINT, self._original_sigint) - if self._original_sigterm is not None and hasattr(signal, 'SIGTERM'): - signal.signal(signal.SIGTERM, self._original_sigterm) - except (ValueError, OSError): - pass - - def start(self) -> None: - """Start watching and indexing.""" - with self._lock: - if 
self._running: - logger.warning("WatcherManager already running") - return - - # Validate path - if not self.root_path.exists(): - raise ValueError(f"Root path does not exist: {self.root_path}") - - # Initialize components - self._registry = RegistryStore() - self._registry.initialize() - self._mapper = PathMapper() - - self._indexer = IncrementalIndexer( - self._registry, self._mapper, self.config - ) - - self._watcher = FileWatcher( - self.root_path, self.watcher_config, self._handle_changes - ) - - # Always register queue change callback for stdout output (TypeScript backend) - # The wrapper prints [QUEUE_STATUS] JSON and optionally calls on_queue_change - self._watcher.register_queue_change_callback(self._on_queue_change_wrapper) - - # Install signal handlers - self._install_signal_handlers() - - # Start watcher - self._running = True - self._stats.is_running = True - self._stop_event.clear() - self._watcher.start() - - logger.info("WatcherManager started for: %s", self.root_path) - - def stop(self) -> None: - """Stop watching and clean up.""" - with self._lock: - if not self._running: - return - - self._running = False - self._stats.is_running = False - self._stop_event.set() - - # Stop watcher - if self._watcher: - self._watcher.stop() - self._watcher = None - - # Close indexer - if self._indexer: - self._indexer.close() - self._indexer = None - - # Close registry - if self._registry: - self._registry.close() - self._registry = None - - # Restore signal handlers - self._restore_signal_handlers() - - logger.info("WatcherManager stopped") - - def wait(self) -> None: - """Block until stopped.""" - try: - while self._running: - self._stop_event.wait(timeout=1.0) - except KeyboardInterrupt: - logger.info("Interrupted, stopping...") - self.stop() - - @property - def is_running(self) -> bool: - """Check if manager is running.""" - return self._running - - def get_stats(self) -> WatcherStats: - """Get runtime statistics.""" - return WatcherStats( - 
files_watched=self._stats.files_watched, - events_processed=self._stats.events_processed, - last_event_time=self._stats.last_event_time, - is_running=self._running, - ) - - def _on_queue_change_wrapper(self, status: PendingQueueStatus) -> None: - """Wrapper for queue change callback with JSON output.""" - # Output JSON for TypeScript backend parsing - status_data = { - "file_count": status.file_count, - "files": status.files, - "countdown_seconds": status.countdown_seconds, - "last_event_time": status.last_event_time - } - print(f"[QUEUE_STATUS] {json.dumps(status_data)}", flush=True) - - if self.on_queue_change: - try: - self.on_queue_change(status) - except Exception as exc: - logger.error("Error in on_queue_change callback: %s", exc) - - def flush_now(self) -> None: - """Immediately flush pending queue (manual trigger).""" - if self._watcher: - self._watcher.flush_now() - - def get_pending_queue_status(self) -> Optional[PendingQueueStatus]: - """Get current pending queue status.""" - if self._watcher: - return self._watcher.get_pending_queue_status() - return None - - def get_index_history(self, limit: int = 5) -> List[IndexResult]: - """Get recent index history.""" - return self._index_history[-limit:]