feat: Enhance embedding generation and search capabilities

- Added pre-calculation of estimated chunk count for HNSW capacity in `generate_dense_embeddings_centralized` to optimize indexing performance.
- Implemented binary vector generation with memory-mapped storage for efficient cascade search, including a JSON sidecar that records the matrix shape and chunk IDs.
- Introduced centralized SPLADE sparse index generation, with batch encoding, chunk metadata storage, and index metadata.
- Updated `ChainSearchEngine` to prefer centralized binary searcher for improved performance and added fallback to legacy binary index.
- Deprecated `BinaryANNIndex` in favor of `BinarySearcher` for better memory management and performance.
- Enhanced `SpladeEncoder` with warmup functionality to reduce latency spikes during first-time inference.
- Increased the `SpladeIndex` SQLite page cache (about 20 MB) for better query performance.
- Added methods for managing binary vectors in `VectorMetadataStore`, including batch insertion and retrieval.
- Created a new `BinarySearcher` class for efficient binary vector search using Hamming distance, supporting both memory-mapped and database loading modes.
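
For orientation, the binarization and Hamming-distance ranking that the cascade path relies on can be sketched roughly as below. This is an illustrative snippet rather than code from this commit; the helper names are made up, and it assumes NumPy arrays of dense embeddings.

import numpy as np

# Bit count for every possible byte value (0-255), used as a popcount lookup table.
POPCOUNT = np.array([bin(i).count("1") for i in range(256)], dtype=np.uint8)

def binarize(dense: np.ndarray) -> np.ndarray:
    """Threshold a dense vector at zero and pack 8 bits per byte."""
    return np.packbits((dense > 0).astype(np.uint8), axis=-1)

def hamming_top_k(query_dense: np.ndarray, packed_matrix: np.ndarray, k: int) -> np.ndarray:
    """Return row indices of packed_matrix closest to the query by Hamming distance."""
    xor = np.bitwise_xor(packed_matrix, binarize(query_dense))  # differing bits, byte-wise
    distances = POPCOUNT[xor].sum(axis=1, dtype=np.int32)       # popcount per row
    if k >= len(distances):
        return np.argsort(distances)
    idx = np.argpartition(distances, k)[:k]                     # partial sort for speed
    return idx[np.argsort(distances[idx])]                      # order the top-k exactly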
catlog22
2026-01-02 23:57:55 +08:00
parent 96b44e1482
commit 54fd94547c
12 changed files with 945 additions and 167 deletions

View File

@@ -15,6 +15,7 @@ Requires-Dist: tree-sitter-python>=0.25
Requires-Dist: tree-sitter-javascript>=0.25
Requires-Dist: tree-sitter-typescript>=0.23
Requires-Dist: pathspec>=0.11
Requires-Dist: watchdog>=3.0
Provides-Extra: semantic
Requires-Dist: numpy>=1.24; extra == "semantic"
Requires-Dist: fastembed>=0.2; extra == "semantic"
@@ -29,6 +30,26 @@ Requires-Dist: numpy>=1.24; extra == "semantic-directml"
Requires-Dist: fastembed>=0.2; extra == "semantic-directml"
Requires-Dist: hnswlib>=0.8.0; extra == "semantic-directml"
Requires-Dist: onnxruntime-directml>=1.15.0; extra == "semantic-directml"
Provides-Extra: reranker-onnx
Requires-Dist: optimum>=1.16; extra == "reranker-onnx"
Requires-Dist: onnxruntime>=1.15; extra == "reranker-onnx"
Requires-Dist: transformers>=4.36; extra == "reranker-onnx"
Provides-Extra: reranker-api
Requires-Dist: httpx>=0.25; extra == "reranker-api"
Provides-Extra: reranker-litellm
Requires-Dist: ccw-litellm>=0.1; extra == "reranker-litellm"
Provides-Extra: reranker-legacy
Requires-Dist: sentence-transformers>=2.2; extra == "reranker-legacy"
Provides-Extra: reranker
Requires-Dist: optimum>=1.16; extra == "reranker"
Requires-Dist: onnxruntime>=1.15; extra == "reranker"
Requires-Dist: transformers>=4.36; extra == "reranker"
Provides-Extra: splade
Requires-Dist: transformers>=4.36; extra == "splade"
Requires-Dist: optimum[onnxruntime]>=1.16; extra == "splade"
Provides-Extra: splade-gpu
Requires-Dist: transformers>=4.36; extra == "splade-gpu"
Requires-Dist: optimum[onnxruntime-gpu]>=1.16; extra == "splade-gpu"
Provides-Extra: encoding
Requires-Dist: chardet>=5.0; extra == "encoding"
Provides-Extra: full

View File

@@ -8,6 +8,7 @@ src/codexlens/__init__.py
src/codexlens/__main__.py
src/codexlens/config.py
src/codexlens/entities.py
src/codexlens/env_config.py
src/codexlens/errors.py
src/codexlens/cli/__init__.py
src/codexlens/cli/commands.py
@@ -15,6 +16,7 @@ src/codexlens/cli/embedding_manager.py
src/codexlens/cli/model_manager.py
src/codexlens/cli/output.py
src/codexlens/indexing/__init__.py
src/codexlens/indexing/embedding.py
src/codexlens/indexing/symbol_extractor.py
src/codexlens/parsers/__init__.py
src/codexlens/parsers/encoding.py
@@ -24,6 +26,7 @@ src/codexlens/parsers/treesitter_parser.py
src/codexlens/search/__init__.py
src/codexlens/search/chain_search.py
src/codexlens/search/enrichment.py
src/codexlens/search/graph_expander.py
src/codexlens/search/hybrid_search.py
src/codexlens/search/query_parser.py
src/codexlens/search/ranking.py
@@ -37,28 +40,52 @@ src/codexlens/semantic/factory.py
src/codexlens/semantic/gpu_support.py
src/codexlens/semantic/litellm_embedder.py
src/codexlens/semantic/rotational_embedder.py
src/codexlens/semantic/splade_encoder.py
src/codexlens/semantic/vector_store.py
src/codexlens/semantic/reranker/__init__.py
src/codexlens/semantic/reranker/api_reranker.py
src/codexlens/semantic/reranker/base.py
src/codexlens/semantic/reranker/factory.py
src/codexlens/semantic/reranker/legacy.py
src/codexlens/semantic/reranker/litellm_reranker.py
src/codexlens/semantic/reranker/onnx_reranker.py
src/codexlens/storage/__init__.py
src/codexlens/storage/dir_index.py
src/codexlens/storage/file_cache.py
src/codexlens/storage/global_index.py
src/codexlens/storage/index_tree.py
src/codexlens/storage/merkle_tree.py
src/codexlens/storage/migration_manager.py
src/codexlens/storage/path_mapper.py
src/codexlens/storage/registry.py
src/codexlens/storage/splade_index.py
src/codexlens/storage/sqlite_store.py
src/codexlens/storage/sqlite_utils.py
src/codexlens/storage/vector_meta_store.py
src/codexlens/storage/migrations/__init__.py
src/codexlens/storage/migrations/migration_001_normalize_keywords.py
src/codexlens/storage/migrations/migration_002_add_token_metadata.py
src/codexlens/storage/migrations/migration_004_dual_fts.py
src/codexlens/storage/migrations/migration_005_cleanup_unused_fields.py
src/codexlens/storage/migrations/migration_006_enhance_relationships.py
src/codexlens/storage/migrations/migration_007_add_graph_neighbors.py
src/codexlens/storage/migrations/migration_008_add_merkle_hashes.py
src/codexlens/storage/migrations/migration_009_add_splade.py
src/codexlens/storage/migrations/migration_010_add_multi_vector_chunks.py
src/codexlens/watcher/__init__.py
src/codexlens/watcher/events.py
src/codexlens/watcher/file_watcher.py
src/codexlens/watcher/incremental_indexer.py
src/codexlens/watcher/manager.py
tests/test_ann_index.py
tests/test_api_reranker.py
tests/test_chain_search.py
tests/test_cli_hybrid_search.py
tests/test_cli_output.py
tests/test_code_extractor.py
tests/test_config.py
tests/test_dual_fts.py
tests/test_embedder.py
tests/test_embedding_backend_availability.py
tests/test_encoding.py
tests/test_enrichment.py
@@ -67,15 +94,22 @@ tests/test_errors.py
tests/test_file_cache.py
tests/test_global_index.py
tests/test_global_symbol_index.py
tests/test_graph_expansion.py
tests/test_hybrid_chunker.py
tests/test_hybrid_search_e2e.py
tests/test_hybrid_search_reranker_backend.py
tests/test_incremental_indexing.py
tests/test_litellm_reranker.py
tests/test_merkle_detection.py
tests/test_parser_integration.py
tests/test_parsers.py
tests/test_performance_optimizations.py
tests/test_pure_vector_search.py
tests/test_query_parser.py
tests/test_recursive_splitting.py
tests/test_registry.py
tests/test_reranker_backends.py
tests/test_reranker_factory.py
tests/test_result_grouping.py
tests/test_rrf_fusion.py
tests/test_schema_cleanup_migration.py
@@ -85,11 +119,14 @@ tests/test_search_full_coverage.py
tests/test_search_performance.py
tests/test_semantic.py
tests/test_semantic_search.py
tests/test_sqlite_store.py
tests/test_storage.py
tests/test_storage_concurrency.py
tests/test_symbol_extractor.py
tests/test_token_chunking.py
tests/test_token_storage.py
tests/test_tokenizer.py
tests/test_tokenizer_performance.py
tests/test_treesitter_parser.py
tests/test_vector_search_full.py
tests/test_vector_store.py

View File

@@ -6,6 +6,7 @@ tree-sitter-python>=0.25
tree-sitter-javascript>=0.25
tree-sitter-typescript>=0.23
pathspec>=0.11
watchdog>=3.0
[encoding]
chardet>=5.0
@@ -13,6 +14,25 @@ chardet>=5.0
[full]
tiktoken>=0.5.0
[reranker]
optimum>=1.16
onnxruntime>=1.15
transformers>=4.36
[reranker-api]
httpx>=0.25
[reranker-legacy]
sentence-transformers>=2.2
[reranker-litellm]
ccw-litellm>=0.1
[reranker-onnx]
optimum>=1.16
onnxruntime>=1.15
transformers>=4.36
[semantic]
numpy>=1.24
fastembed>=0.2
@@ -29,3 +49,11 @@ numpy>=1.24
fastembed>=0.2
hnswlib>=0.8.0
onnxruntime-gpu>=1.15.0
[splade]
transformers>=4.36
optimum[onnxruntime]>=1.16
[splade-gpu]
transformers>=4.36
optimum[onnxruntime-gpu]>=1.16

View File

@@ -36,6 +36,27 @@ from .output import (
app = typer.Typer(help="CodexLens CLI — local code indexing and search.")
# Index subcommand group for reorganized commands
index_app = typer.Typer(help="Index management commands (embeddings, SPLADE, migrations).")
app.add_typer(index_app, name="index")
def _deprecated_command_warning(old_name: str, new_name: str) -> None:
"""Display deprecation warning for renamed commands.
Args:
old_name: The old command name being deprecated
new_name: The new command name to use instead
"""
console.print(
f"[yellow]Warning:[/yellow] '{old_name}' is deprecated. "
f"Use '{new_name}' instead."
)
# Index management subcommand group
index_app = typer.Typer(help="Index management commands (init, embeddings, splade, binary, status, migrate, all)")
app.add_typer(index_app, name="index")
def _configure_logging(verbose: bool, json_mode: bool = False) -> None:
"""Configure logging level.
@@ -96,8 +117,8 @@ def _get_registry_path() -> Path:
return Path.home() / ".codexlens" / "registry.db"
@app.command() @index_app.command("init")
def init( def index_init(
path: Path = typer.Argument(Path("."), exists=True, file_okay=False, dir_okay=True, help="Project root to index."),
language: Optional[List[str]] = typer.Option(
None,
@@ -108,8 +129,8 @@ def init(
workers: Optional[int] = typer.Option(None, "--workers", "-w", min=1, help="Parallel worker processes (default: auto-detect based on CPU count)."),
force: bool = typer.Option(False, "--force", "-f", help="Force full reindex (skip incremental mode)."),
no_embeddings: bool = typer.Option(False, "--no-embeddings", help="Skip automatic embedding generation (if semantic deps installed)."),
embedding_backend: str = typer.Option("fastembed", "--embedding-backend", help="Embedding backend: fastembed (local) or litellm (remote API)."), backend: str = typer.Option("fastembed", "--backend", "-b", help="Embedding backend: fastembed (local) or litellm (remote API)."),
embedding_model: str = typer.Option("code", "--embedding-model", help="Embedding model: profile name for fastembed (fast/code/multilingual/balanced) or model name for litellm (e.g. text-embedding-3-small)."), model: str = typer.Option("code", "--model", "-m", help="Embedding model: profile name for fastembed (fast/code/multilingual/balanced) or model name for litellm (e.g. text-embedding-3-small)."),
max_workers: int = typer.Option(1, "--max-workers", min=1, help="Max concurrent API calls for embedding generation. Recommended: 4-8 for litellm backend."),
json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."),
@@ -125,11 +146,11 @@ def init(
If semantic search dependencies are installed, automatically generates embeddings
after indexing completes. Use --no-embeddings to skip this step.
Embedding Backend Options: Backend Options (--backend):
- fastembed: Local ONNX-based embeddings (default, no API calls)
- litellm: Remote API embeddings via ccw-litellm (requires API keys)
Embedding Model Options: Model Options (--model):
- For fastembed backend: Use profile names (fast, code, multilingual, balanced)
- For litellm backend: Use model names (e.g., text-embedding-3-small, text-embedding-ada-002)
"""
@@ -182,15 +203,15 @@ def init(
# Validate embedding backend
valid_backends = ["fastembed", "litellm"]
if embedding_backend not in valid_backends: if backend not in valid_backends:
error_msg = f"Invalid embedding backend: {embedding_backend}. Must be one of: {', '.join(valid_backends)}" error_msg = f"Invalid embedding backend: {backend}. Must be one of: {', '.join(valid_backends)}"
if json_mode:
print_json(success=False, error=error_msg)
else:
console.print(f"[red]Error:[/red] {error_msg}")
raise typer.Exit(code=1)
backend_available, backend_error = is_embedding_backend_available(embedding_backend) backend_available, backend_error = is_embedding_backend_available(backend)
if backend_available:
# Use the index root directory (not the _index.db file)
@@ -198,8 +219,8 @@ def init(
if not json_mode:
console.print("\n[bold]Generating embeddings...[/bold]")
console.print(f"Backend: [cyan]{embedding_backend}[/cyan]") console.print(f"Backend: [cyan]{backend}[/cyan]")
console.print(f"Model: [cyan]{embedding_model}[/cyan]") console.print(f"Model: [cyan]{model}[/cyan]")
else:
# Output progress message for JSON mode (parsed by Node.js)
print("Generating embeddings...", flush=True)
@@ -219,8 +240,8 @@ def init(
embed_result = generate_embeddings_recursive(
index_root,
embedding_backend=embedding_backend, embedding_backend=backend,
model_profile=embedding_model, model_profile=model,
force=False, # Don't force regenerate during init
chunk_size=2000,
progress_callback=progress_update, # Always use callback
@@ -266,7 +287,7 @@ def init(
}
else:
if not json_mode and verbose:
console.print(f"[dim]Embedding backend '{embedding_backend}' not available. Skipping embeddings.[/dim]") console.print(f"[dim]Embedding backend '{backend}' not available. Skipping embeddings.[/dim]")
result["embeddings"] = { result["embeddings"] = {
"generated": False, "generated": False,
"error": backend_error or "Embedding backend not available", "error": backend_error or "Embedding backend not available",
@@ -410,22 +431,20 @@ def watch(
@app.command()
def search(
query: str = typer.Argument(..., help="FTS query to run."), query: str = typer.Argument(..., help="Search query."),
path: Path = typer.Option(Path("."), "--path", "-p", help="Directory to search from."),
limit: int = typer.Option(20, "--limit", "-n", min=1, max=500, help="Max results."),
depth: int = typer.Option(-1, "--depth", "-d", help="Search depth (-1 = unlimited, 0 = current only)."),
files_only: bool = typer.Option(False, "--files-only", "-f", help="Return only file paths without content snippets."),
mode: str = typer.Option("auto", "--mode", "-m", help="Search mode: auto, exact, fuzzy, hybrid, vector, pure-vector."), method: str = typer.Option("hybrid", "--method", "-m", help="Search method: fts, vector, splade, hybrid, cascade."),
use_fuzzy: bool = typer.Option(False, "--use-fuzzy", help="Enable fuzzy matching in FTS method."),
weights: Optional[str] = typer.Option(
None,
"--weights", "-w",
help="RRF weights as key=value pairs (e.g., 'splade=0.4,vector=0.6' or 'exact=0.3,fuzzy=0.1,vector=0.6'). Default: auto-detect based on available backends." help="RRF weights as key=value pairs (e.g., 'splade=0.4,vector=0.6' or 'fts=0.4,vector=0.6'). Default: auto-detect based on available backends."
),
use_fts: bool = typer.Option(
False,
"--use-fts",
help="Use FTS (exact+fuzzy) instead of SPLADE for sparse retrieval"
), ),
# Hidden deprecated parameter for backward compatibility
mode: Optional[str] = typer.Option(None, "--mode", hidden=True, help="[DEPRECATED] Use --method instead."),
json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."),
) -> None:
@@ -434,64 +453,95 @@ def search(
Uses chain search across directory indexes.
Use --depth to limit search recursion (0 = current dir only).
Search Modes: Search Methods:
- auto: Auto-detect (hybrid if embeddings exist, exact otherwise) [default] - fts: Full-text search using FTS5 (unicode61 tokenizer). Use --use-fuzzy for typo tolerance.
- exact: Exact FTS using unicode61 tokenizer - for code identifiers - vector: Pure semantic vector search - for natural language queries.
- fuzzy: Fuzzy FTS using trigram tokenizer - for typo-tolerant search - splade: SPLADE sparse neural search - semantic term expansion.
- hybrid: RRF fusion of sparse + dense search (recommended) - best recall - hybrid: RRF fusion of sparse + dense search (default) - best recall.
- vector: Vector search with sparse fallback - semantic + keyword - cascade: Two-stage retrieval (binary coarse + dense rerank) - fast + accurate.
- pure-vector: Pure semantic vector search only - natural language queries
SPLADE Mode: Method Selection Guide:
When SPLADE is available (pip install codex-lens[splade]), it automatically - Code identifiers (function/class names): fts
replaces FTS (exact+fuzzy) as the sparse retrieval backend. SPLADE provides - Natural language queries: vector or hybrid
semantic term expansion for better synonym handling. - Typo-tolerant search: fts --use-fuzzy
- Best overall quality: hybrid (default)
Use --use-fts to force FTS mode instead of SPLADE. - Large codebase performance: cascade
Vector Search Requirements:
Vector search modes require pre-generated embeddings. Vector, hybrid, and cascade methods require pre-generated embeddings.
Use 'codexlens embeddings-generate' to create embeddings first.
Hybrid Mode Weights:
Use --weights to adjust RRF fusion weights:
- SPLADE mode: 'splade=0.4,vector=0.6' (default)
- FTS mode: 'exact=0.3,fuzzy=0.1,vector=0.6' (default) - FTS mode: 'fts=0.4,vector=0.6' (default)
Legacy format also supported: '0.3,0.1,0.6' (exact,fuzzy,vector)
Examples:
# Auto-detect mode (uses hybrid if embeddings available) # Default hybrid search
codexlens search "authentication" codexlens search "authentication"
# Explicit exact code search # Exact code identifier search
codexlens search "authenticate_user" --mode exact codexlens search "authenticate_user" --method fts
# Semantic search (requires embeddings) # Typo-tolerant fuzzy search
codexlens search "how to verify user credentials" --mode pure-vector codexlens search "authentcate" --method fts --use-fuzzy
# Force hybrid mode with custom weights # Pure semantic search
codexlens search "authentication" --mode hybrid --weights splade=0.5,vector=0.5 codexlens search "how to verify user credentials" --method vector
# Force FTS instead of SPLADE # SPLADE sparse neural search
codexlens search "authentication" --use-fts codexlens search "user login flow" --method splade
# Fast cascade retrieval for large codebases
codexlens search "authentication" --method cascade
# Hybrid with custom weights
codexlens search "authentication" --method hybrid --weights splade=0.5,vector=0.5
""" """
_configure_logging(verbose, json_mode) _configure_logging(verbose, json_mode)
search_path = path.expanduser().resolve() search_path = path.expanduser().resolve()
# Configure search with FTS fallback if requested
config = Config()
if use_fts:
config.use_fts_fallback = True
# Validate mode # Handle deprecated --mode parameter
valid_modes = ["auto", "exact", "fuzzy", "hybrid", "vector", "pure-vector"] actual_method = method
if mode not in valid_modes: if mode is not None:
if json_mode: # Show deprecation warning
print_json(success=False, error=f"Invalid mode: {mode}. Must be one of: {', '.join(valid_modes)}") if not json_mode:
console.print("[yellow]Warning: --mode is deprecated, use --method instead.[/yellow]")
# Map old mode values to new method values
mode_to_method = {
"auto": "hybrid",
"exact": "fts",
"fuzzy": "fts", # with use_fuzzy=True
"hybrid": "hybrid",
"vector": "vector",
"pure-vector": "vector",
}
if mode in mode_to_method:
actual_method = mode_to_method[mode]
# Enable fuzzy for old fuzzy mode
if mode == "fuzzy":
use_fuzzy = True
else: else:
console.print(f"[red]Invalid mode:[/red] {mode}") if json_mode:
console.print(f"[dim]Valid modes: {', '.join(valid_modes)}[/dim]") print_json(success=False, error=f"Invalid deprecated mode: {mode}. Use --method instead.")
else:
console.print(f"[red]Invalid deprecated mode:[/red] {mode}")
console.print("[dim]Use --method with: fts, vector, splade, hybrid, cascade[/dim]")
raise typer.Exit(code=1)
# Configure search
config = Config()
# Validate method
valid_methods = ["fts", "vector", "splade", "hybrid", "cascade"]
if actual_method not in valid_methods:
if json_mode:
print_json(success=False, error=f"Invalid method: {actual_method}. Must be one of: {', '.join(valid_methods)}")
else:
console.print(f"[red]Invalid method:[/red] {actual_method}")
console.print(f"[dim]Valid methods: {', '.join(valid_methods)}[/dim]")
raise typer.Exit(code=1) raise typer.Exit(code=1)
# Parse custom weights if provided
@@ -557,48 +607,49 @@ def search(
engine = ChainSearchEngine(registry, mapper, config=config)
# Auto-detect mode if set to "auto" # Map method to SearchOptions flags
actual_mode = mode # fts: FTS-only search (optionally with fuzzy)
if mode == "auto": # vector: Pure vector semantic search
# Check if embeddings are available by looking for project in registry # splade: SPLADE sparse neural search
project_record = registry.find_by_source_path(str(search_path)) # hybrid: RRF fusion of sparse + dense
has_embeddings = False # cascade: Two-stage binary + dense retrieval
if actual_method == "fts":
if project_record: hybrid_mode = False
# Check if index has embeddings enable_fuzzy = use_fuzzy
index_path = Path(project_record["index_root"]) / "_index.db" enable_vector = False
try: pure_vector = False
from codexlens.cli.embedding_manager import check_embeddings_status enable_splade = False
embed_status = check_embeddings_status(index_path) enable_cascade = False
if embed_status["success"]: elif actual_method == "vector":
embed_data = embed_status["result"] hybrid_mode = True
has_embeddings = embed_data["has_embeddings"] and embed_data["chunks_count"] > 0 enable_fuzzy = False
except Exception: enable_vector = True
pass pure_vector = True
enable_splade = False
# Choose mode based on embedding availability enable_cascade = False
if has_embeddings: elif actual_method == "splade":
actual_mode = "hybrid" hybrid_mode = True
if not json_mode and verbose: enable_fuzzy = False
console.print("[dim]Auto-detected mode: hybrid (embeddings available)[/dim]") enable_vector = False
else: pure_vector = False
actual_mode = "exact" enable_splade = True
if not json_mode and verbose: enable_cascade = False
console.print("[dim]Auto-detected mode: exact (no embeddings)[/dim]") elif actual_method == "hybrid":
hybrid_mode = True
# Map mode to options enable_fuzzy = use_fuzzy
if actual_mode == "exact": enable_vector = True
hybrid_mode, enable_fuzzy, enable_vector, pure_vector = False, False, False, False pure_vector = False
elif actual_mode == "fuzzy": enable_splade = True # SPLADE is preferred sparse in hybrid
hybrid_mode, enable_fuzzy, enable_vector, pure_vector = False, True, False, False enable_cascade = False
elif actual_mode == "vector": elif actual_method == "cascade":
hybrid_mode, enable_fuzzy, enable_vector, pure_vector = True, False, True, False # Vector + exact fallback hybrid_mode = True
elif actual_mode == "pure-vector": enable_fuzzy = False
hybrid_mode, enable_fuzzy, enable_vector, pure_vector = True, False, True, True # Pure vector only enable_vector = True
elif actual_mode == "hybrid": pure_vector = False
hybrid_mode, enable_fuzzy, enable_vector, pure_vector = True, True, True, False enable_splade = False
enable_cascade = True
else: else:
raise ValueError(f"Invalid mode: {actual_mode}") raise ValueError(f"Invalid method: {actual_method}")
options = SearchOptions(
depth=depth,
@@ -1960,8 +2011,8 @@ def embeddings_status(
console.print(f" [cyan]codexlens embeddings-generate {index_path}[/cyan]") console.print(f" [cyan]codexlens embeddings-generate {index_path}[/cyan]")
@app.command(name="embeddings-generate") @index_app.command("embeddings")
def embeddings_generate( def index_embeddings(
path: Path = typer.Argument(
...,
exists=True,
@@ -2000,10 +2051,10 @@ def embeddings_generate(
json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose output."),
centralized: bool = typer.Option( centralized: bool = typer.Option(
False, True,
"--centralized", "--centralized/--distributed",
"-c", "-c/-d",
help="Use centralized vector storage (single HNSW index at project root).", help="Use centralized vector storage (default) or distributed per-directory indexes.",
),
) -> None:
"""Generate semantic embeddings for code search.
@@ -2033,11 +2084,11 @@ def embeddings_generate(
- Any model supported by ccw-litellm
Examples:
codexlens embeddings-generate ~/projects/my-app # Auto-find index (fastembed, code profile) codexlens index embeddings ~/projects/my-app # Auto-find index (fastembed, code profile)
codexlens embeddings-generate ~/.codexlens/indexes/project/_index.db # Specific index codexlens index embeddings ~/.codexlens/indexes/project/_index.db # Specific index
codexlens embeddings-generate ~/projects/my-app --backend litellm --model text-embedding-3-small # Use LiteLLM codexlens index embeddings ~/projects/my-app --backend litellm --model text-embedding-3-small # Use LiteLLM
codexlens embeddings-generate ~/projects/my-app --model fast --force # Regenerate with fast profile codexlens index embeddings ~/projects/my-app --model fast --force # Regenerate with fast profile
codexlens embeddings-generate ~/projects/my-app --centralized # Centralized vector storage codexlens index embeddings ~/projects/my-app --centralized # Centralized vector storage
""" """
_configure_logging(verbose, json_mode) _configure_logging(verbose, json_mode)
@@ -2072,25 +2123,20 @@ def embeddings_generate(
index_path = target_path
index_root = target_path.parent
elif target_path.is_dir():
# Directory: Try to find index for this project # Directory: Find index location from registry
if centralized: registry = RegistryStore()
# Centralized mode uses directory as root try:
index_root = target_path registry.initialize()
else: mapper = PathMapper()
# Single index mode: find the specific index index_path = mapper.source_to_index_db(target_path)
registry = RegistryStore()
try:
registry.initialize()
mapper = PathMapper()
index_path = mapper.source_to_index_db(target_path)
if not index_path.exists():
console.print(f"[red]Error:[/red] No index found for {target_path}")
console.print("Run 'codexlens init' first to create an index")
raise typer.Exit(code=1)
index_root = index_path.parent index_root = index_path.parent # Use index directory for both modes
finally:
registry.close()
else:
console.print(f"[red]Error:[/red] Path must be _index.db file or directory")
raise typer.Exit(code=1)
@@ -2442,8 +2488,8 @@ def gpu_reset(
# ==================== SPLADE Commands ====================
@app.command("splade-index") @index_app.command("splade")
def splade_index_command( def index_splade(
path: Path = typer.Argument(..., help="Project path to index"),
rebuild: bool = typer.Option(False, "--rebuild", "-r", help="Force rebuild SPLADE index"),
verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose output."),
@@ -2457,8 +2503,8 @@ def splade_index_command(
index directory and builds SPLADE encodings for chunks across all of them.
Examples:
codexlens splade-index ~/projects/my-app codexlens index splade ~/projects/my-app
codexlens splade-index . --rebuild codexlens index splade . --rebuild
""" """
_configure_logging(verbose) _configure_logging(verbose)

View File

@@ -1170,6 +1170,22 @@ def generate_dense_embeddings_centralized(
if progress_callback:
progress_callback(f"Found {len(index_files)} index databases for centralized embedding")
# Pre-calculate estimated chunk count for HNSW capacity
# This avoids expensive resize operations during indexing
estimated_total_files = 0
for index_path in index_files:
try:
with sqlite3.connect(index_path) as conn:
cursor = conn.execute("SELECT COUNT(*) FROM files")
estimated_total_files += cursor.fetchone()[0]
except Exception:
pass
# Heuristic: ~15 chunks per file on average
estimated_chunks = max(100000, estimated_total_files * 15)
if progress_callback:
progress_callback(f"Estimated {estimated_total_files} files, ~{estimated_chunks} chunks")
# Check for existing centralized index
central_hnsw_path = index_root / VECTORS_HNSW_NAME
if central_hnsw_path.exists() and not force:
@@ -1217,11 +1233,12 @@ def generate_dense_embeddings_centralized(
"error": f"Failed to initialize components: {str(e)}", "error": f"Failed to initialize components: {str(e)}",
} }
# Create centralized ANN index # Create centralized ANN index with pre-calculated capacity
# Using estimated_chunks avoids expensive resize operations during indexing
central_ann_index = ANNIndex.create_central(
index_root=index_root,
dim=embedder.embedding_dim,
initial_capacity=100000, # Larger capacity for centralized index initial_capacity=estimated_chunks,
auto_save=False,
)
@@ -1360,6 +1377,148 @@ def generate_dense_embeddings_centralized(
logger.warning("Failed to store vector metadata: %s", e) logger.warning("Failed to store vector metadata: %s", e)
# Non-fatal: continue without centralized metadata # Non-fatal: continue without centralized metadata
# --- Binary Vector Generation for Cascade Search (Memory-Mapped) ---
binary_success = False
binary_count = 0
try:
from codexlens.config import Config, BINARY_VECTORS_MMAP_NAME
config = Config.load()
if getattr(config, 'enable_binary_cascade', True) and all_embeddings:
import numpy as np
if progress_callback:
progress_callback(f"Generating binary vectors for {len(all_embeddings)} chunks...")
# Binarize dense vectors: sign(x) -> 1 if x > 0, 0 otherwise
# Pack into bytes for efficient storage and Hamming distance computation
embeddings_matrix = np.vstack(all_embeddings)
binary_matrix = (embeddings_matrix > 0).astype(np.uint8)
# Pack bits into bytes (8 bits per byte) - vectorized for all rows
packed_matrix = np.packbits(binary_matrix, axis=1)
binary_count = len(packed_matrix)
# Save as memory-mapped file for efficient loading
binary_mmap_path = index_root / BINARY_VECTORS_MMAP_NAME
mmap_array = np.memmap(
str(binary_mmap_path),
dtype=np.uint8,
mode='w+',
shape=packed_matrix.shape
)
mmap_array[:] = packed_matrix
mmap_array.flush()
del mmap_array # Close the memmap
# Save metadata (shape and chunk_ids) to sidecar JSON
import json
meta_path = binary_mmap_path.with_suffix('.meta.json')
with open(meta_path, 'w') as f:
json.dump({
'shape': list(packed_matrix.shape),
'chunk_ids': all_chunk_ids,
'embedding_dim': embeddings_matrix.shape[1],
}, f)
# Also store in DB for backward compatibility
from codexlens.storage.vector_meta_store import VectorMetadataStore
binary_packed_bytes = [row.tobytes() for row in packed_matrix]
with VectorMetadataStore(vectors_meta_path) as meta_store:
meta_store.add_binary_vectors(all_chunk_ids, binary_packed_bytes)
binary_success = True
if progress_callback:
progress_callback(f"Generated {binary_count} binary vectors ({embeddings_matrix.shape[1]} dims -> {packed_matrix.shape[1]} bytes, mmap: {binary_mmap_path.name})")
except Exception as e:
logger.warning("Binary vector generation failed: %s", e)
# Non-fatal: continue without binary vectors
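(For scale — assuming 768-dimensional dense embeddings, a dimension this diff does not state — packbits yields 768 / 8 = 96 bytes per vector, so 100,000 chunks occupy roughly 9.6 MB in the memory-mapped file, which the OS can page in on demand.)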
# --- SPLADE Sparse Index Generation (Centralized) ---
splade_success = False
splade_chunks_count = 0
try:
from codexlens.config import Config
config = Config.load()
if config.enable_splade and chunk_id_to_info:
from codexlens.semantic.splade_encoder import check_splade_available, get_splade_encoder
from codexlens.storage.splade_index import SpladeIndex
import json
ok, err = check_splade_available()
if ok:
if progress_callback:
progress_callback(f"Generating SPLADE sparse vectors for {len(chunk_id_to_info)} chunks...")
# Initialize SPLADE encoder and index
splade_encoder = get_splade_encoder(use_gpu=use_gpu)
splade_db_path = index_root / SPLADE_DB_NAME
splade_index = SpladeIndex(splade_db_path)
splade_index.create_tables()
# Batch encode for efficiency
SPLADE_BATCH_SIZE = 32
all_postings = []
all_chunk_metadata = []
# Create batches from chunk_id_to_info
chunk_items = list(chunk_id_to_info.items())
for i in range(0, len(chunk_items), SPLADE_BATCH_SIZE):
batch_items = chunk_items[i:i + SPLADE_BATCH_SIZE]
chunk_ids = [item[0] for item in batch_items]
chunk_contents = [item[1]["content"] for item in batch_items]
# Generate sparse vectors
sparse_vecs = splade_encoder.encode_batch(chunk_contents, batch_size=SPLADE_BATCH_SIZE)
for cid, sparse_vec in zip(chunk_ids, sparse_vecs):
all_postings.append((cid, sparse_vec))
if progress_callback and (i + SPLADE_BATCH_SIZE) % 100 == 0:
progress_callback(f"SPLADE encoding: {min(i + SPLADE_BATCH_SIZE, len(chunk_items))}/{len(chunk_items)}")
# Batch insert all postings
if all_postings:
splade_index.add_postings_batch(all_postings)
# CRITICAL FIX: Populate splade_chunks table
for cid, info in chunk_id_to_info.items():
metadata_str = json.dumps(info.get("metadata", {})) if info.get("metadata") else None
all_chunk_metadata.append((
cid,
info["file_path"],
info["content"],
metadata_str,
info.get("source_index_db")
))
if all_chunk_metadata:
splade_index.add_chunks_metadata_batch(all_chunk_metadata)
splade_chunks_count = len(all_chunk_metadata)
# Set metadata
splade_index.set_metadata(
model_name=splade_encoder.model_name,
vocab_size=splade_encoder.vocab_size
)
splade_index.close()
splade_success = True
if progress_callback:
progress_callback(f"SPLADE index created: {len(all_postings)} postings, {splade_chunks_count} chunks")
else:
if progress_callback:
progress_callback(f"SPLADE not available, skipping sparse index: {err}")
except Exception as e:
logger.warning("SPLADE encoding failed: %s", e)
if progress_callback:
progress_callback(f"SPLADE encoding failed: {e}")
elapsed_time = time.time() - start_time
# Cleanup
@@ -1380,6 +1539,10 @@ def generate_dense_embeddings_centralized(
"model_name": embedder.model_name, "model_name": embedder.model_name,
"central_index_path": str(central_hnsw_path), "central_index_path": str(central_hnsw_path),
"failed_files": failed_files[:5], "failed_files": failed_files[:5],
"splade_success": splade_success,
"splade_chunks": splade_chunks_count,
"binary_success": binary_success,
"binary_count": binary_count,
},
}

View File

@@ -25,6 +25,7 @@ SPLADE_DB_NAME = "_splade.db"
# Dense vector storage names (centralized storage)
VECTORS_HNSW_NAME = "_vectors.hnsw"
VECTORS_META_DB_NAME = "_vectors_meta.db"
BINARY_VECTORS_MMAP_NAME = "_binary_vectors.mmap"
log = logging.getLogger(__name__)

View File

@@ -0,0 +1,277 @@
"""Binary vector searcher for cascade search.
This module provides fast binary vector search using Hamming distance
for the first stage of cascade search (coarse filtering).
Supports two loading modes:
1. Memory-mapped file (preferred): Low memory footprint, OS-managed paging
2. Database loading (fallback): Loads all vectors into RAM
"""
from __future__ import annotations
import json
import logging
from pathlib import Path
from typing import List, Optional, Tuple
import numpy as np
logger = logging.getLogger(__name__)
# Pre-computed popcount lookup table for vectorized Hamming distance
# Each byte value (0-255) maps to its bit count
_POPCOUNT_TABLE = np.array([bin(i).count('1') for i in range(256)], dtype=np.uint8)
class BinarySearcher:
"""Fast binary vector search using Hamming distance.
This class implements the first stage of cascade search:
fast, approximate retrieval using binary vectors and Hamming distance.
The binary vectors are derived from dense embeddings by thresholding:
binary[i] = 1 if dense[i] > 0 else 0
Hamming distance between two binary vectors counts the number of
differing bits, which can be computed very efficiently using XOR
and population count.
Supports two loading modes:
- Memory-mapped file (preferred): Uses np.memmap for minimal RAM usage
- Database (fallback): Loads all vectors into memory from SQLite
"""
def __init__(self, index_root_or_meta_path: Path) -> None:
"""Initialize BinarySearcher.
Args:
index_root_or_meta_path: Either:
- Path to index root directory (containing _binary_vectors.mmap)
- Path to _vectors_meta.db (legacy mode, loads from DB)
"""
path = Path(index_root_or_meta_path)
# Determine if this is an index root or a specific DB path
if path.suffix == '.db':
# Legacy mode: specific DB path
self.index_root = path.parent
self.meta_store_path = path
else:
# New mode: index root directory
self.index_root = path
self.meta_store_path = path / "_vectors_meta.db"
self._chunk_ids: Optional[np.ndarray] = None
self._binary_matrix: Optional[np.ndarray] = None
self._is_memmap = False
self._loaded = False
def load(self) -> bool:
"""Load binary vectors using memory-mapped file or database fallback.
Tries to load from memory-mapped file first (preferred for large indexes),
falls back to database loading if mmap file doesn't exist.
Returns:
True if vectors were loaded successfully.
"""
if self._loaded:
return True
# Try memory-mapped file first (preferred)
mmap_path = self.index_root / "_binary_vectors.mmap"
meta_path = mmap_path.with_suffix('.meta.json')
if mmap_path.exists() and meta_path.exists():
try:
with open(meta_path, 'r') as f:
meta = json.load(f)
shape = tuple(meta['shape'])
self._chunk_ids = np.array(meta['chunk_ids'], dtype=np.int64)
# Memory-map the binary matrix (read-only)
self._binary_matrix = np.memmap(
str(mmap_path),
dtype=np.uint8,
mode='r',
shape=shape
)
self._is_memmap = True
self._loaded = True
logger.info(
"Memory-mapped %d binary vectors (%d bytes each)",
len(self._chunk_ids), shape[1]
)
return True
except Exception as e:
logger.warning("Failed to load mmap binary vectors, falling back to DB: %s", e)
# Fallback: load from database
return self._load_from_db()
def _load_from_db(self) -> bool:
"""Load binary vectors from database (legacy/fallback mode).
Returns:
True if vectors were loaded successfully.
"""
try:
from codexlens.storage.vector_meta_store import VectorMetadataStore
with VectorMetadataStore(self.meta_store_path) as store:
rows = store.get_all_binary_vectors()
if not rows:
logger.warning("No binary vectors found in %s", self.meta_store_path)
return False
# Convert to numpy arrays for fast computation
self._chunk_ids = np.array([r[0] for r in rows], dtype=np.int64)
# Unpack bytes to numpy array
binary_arrays = []
for _, vec_bytes in rows:
arr = np.frombuffer(vec_bytes, dtype=np.uint8)
binary_arrays.append(arr)
self._binary_matrix = np.vstack(binary_arrays)
self._is_memmap = False
self._loaded = True
logger.info(
"Loaded %d binary vectors from DB (%d bytes each)",
len(self._chunk_ids), self._binary_matrix.shape[1]
)
return True
except Exception as e:
logger.error("Failed to load binary vectors: %s", e)
return False
def search(
self,
query_vector: np.ndarray,
top_k: int = 100
) -> List[Tuple[int, int]]:
"""Search for similar vectors using Hamming distance.
Args:
query_vector: Dense query vector (will be binarized).
top_k: Number of top results to return.
Returns:
List of (chunk_id, hamming_distance) tuples sorted by distance.
"""
if not self._loaded and not self.load():
return []
# Binarize query vector
query_binary = (query_vector > 0).astype(np.uint8)
query_packed = np.packbits(query_binary)
# Compute Hamming distances using XOR and popcount
# XOR gives 1 for differing bits
xor_result = np.bitwise_xor(self._binary_matrix, query_packed)
# Vectorized popcount using lookup table (orders of magnitude faster)
# Sum the bit counts for each byte across all columns
distances = np.sum(_POPCOUNT_TABLE[xor_result], axis=1, dtype=np.int32)
# Get top-k with smallest distances
if top_k >= len(distances):
top_indices = np.argsort(distances)
else:
# Partial sort for efficiency
top_indices = np.argpartition(distances, top_k)[:top_k]
top_indices = top_indices[np.argsort(distances[top_indices])]
results = [
(int(self._chunk_ids[i]), int(distances[i]))
for i in top_indices
]
return results
def search_with_rerank(
self,
query_dense: np.ndarray,
dense_vectors: np.ndarray,
dense_chunk_ids: np.ndarray,
top_k: int = 10,
candidates: int = 100
) -> List[Tuple[int, float]]:
"""Two-stage cascade search: binary filter + dense rerank.
Args:
query_dense: Dense query vector.
dense_vectors: Dense vectors for reranking (from HNSW or stored).
dense_chunk_ids: Chunk IDs corresponding to dense_vectors.
top_k: Final number of results.
candidates: Number of candidates from binary search.
Returns:
List of (chunk_id, cosine_similarity) tuples.
"""
# Stage 1: Binary filtering
binary_results = self.search(query_dense, top_k=candidates)
if not binary_results:
return []
candidate_ids = {r[0] for r in binary_results}
# Stage 2: Dense reranking
# Find indices of candidates in dense_vectors
candidate_mask = np.isin(dense_chunk_ids, list(candidate_ids))
candidate_indices = np.where(candidate_mask)[0]
if len(candidate_indices) == 0:
# Fallback: return binary results with normalized distance
max_dist = max(r[1] for r in binary_results) if binary_results else 1
return [(r[0], 1.0 - r[1] / max_dist) for r in binary_results[:top_k]]
# Compute cosine similarities for candidates
candidate_vectors = dense_vectors[candidate_indices]
candidate_ids_array = dense_chunk_ids[candidate_indices]
# Normalize vectors
query_norm = query_dense / (np.linalg.norm(query_dense) + 1e-8)
cand_norms = candidate_vectors / (
np.linalg.norm(candidate_vectors, axis=1, keepdims=True) + 1e-8
)
# Cosine similarities
similarities = np.dot(cand_norms, query_norm)
# Sort by similarity (descending)
sorted_indices = np.argsort(-similarities)[:top_k]
results = [
(int(candidate_ids_array[i]), float(similarities[i]))
for i in sorted_indices
]
return results
@property
def vector_count(self) -> int:
"""Get number of loaded binary vectors."""
return len(self._chunk_ids) if self._chunk_ids is not None else 0
@property
def is_memmap(self) -> bool:
"""Check if using memory-mapped file (vs in-memory array)."""
return self._is_memmap
def clear(self) -> None:
"""Clear loaded vectors from memory."""
# For memmap, just delete the reference (OS will handle cleanup)
if self._is_memmap and self._binary_matrix is not None:
del self._binary_matrix
self._chunk_ids = None
self._binary_matrix = None
self._is_memmap = False
self._loaded = False
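
A minimal usage sketch for the class above; the index-root path and the 768-dimension query below are illustrative assumptions, and in the real flow the query vector comes from the dense embedder:

from pathlib import Path
import numpy as np
from codexlens.search.binary_searcher import BinarySearcher

searcher = BinarySearcher(Path.home() / ".codexlens" / "indexes" / "my-project")  # hypothetical index root
if searcher.load():                    # prefers _binary_vectors.mmap, falls back to _vectors_meta.db
    query = np.random.rand(768) - 0.5  # stand-in for a real dense query embedding
    for chunk_id, distance in searcher.search(query, top_k=5):
        print(chunk_id, distance)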

View File

@@ -541,26 +541,55 @@ class ChainSearchEngine:
)
return self.hybrid_cascade_search(query, source_path, k, coarse_k, options)
# Search all indexes for binary candidates # Try centralized BinarySearcher first (preferred for mmap indexes)
# The index root is the parent of the first index path
index_root = index_paths[0].parent if index_paths else None
all_candidates: List[Tuple[int, int, Path]] = [] # (chunk_id, distance, index_path) all_candidates: List[Tuple[int, int, Path]] = [] # (chunk_id, distance, index_path)
used_centralized = False
for index_path in index_paths: if index_root:
try: centralized_searcher = self._get_centralized_binary_searcher(index_root)
# Get or create binary index for this path if centralized_searcher is not None:
binary_index = self._get_or_create_binary_index(index_path) try:
if binary_index is None or binary_index.count() == 0: # BinarySearcher expects dense vector, not packed binary
continue from codexlens.semantic.embedder import Embedder
embedder = Embedder()
query_dense = embedder.embed_to_numpy([query])[0]
# Search binary index # Centralized search - returns (chunk_id, hamming_distance) tuples
ids, distances = binary_index.search(query_binary_packed, coarse_k) results = centralized_searcher.search(query_dense, top_k=coarse_k)
for chunk_id, dist in zip(ids, distances): for chunk_id, dist in results:
all_candidates.append((chunk_id, dist, index_path)) all_candidates.append((chunk_id, dist, index_root))
used_centralized = True
self.logger.debug(
"Centralized binary search found %d candidates", len(results)
)
except Exception as exc:
self.logger.debug(
"Centralized binary search failed: %s, falling back to per-directory",
exc
)
centralized_searcher.clear()
except Exception as exc: # Fallback: Search per-directory indexes with legacy BinaryANNIndex
self.logger.debug( if not used_centralized:
"Binary search failed for %s: %s", index_path, exc for index_path in index_paths:
) try:
stats.errors.append(f"Binary search failed for {index_path}: {exc}") # Get or create binary index for this path (uses deprecated BinaryANNIndex)
binary_index = self._get_or_create_binary_index(index_path)
if binary_index is None or binary_index.count() == 0:
continue
# Search binary index
ids, distances = binary_index.search(query_binary_packed, coarse_k)
for chunk_id, dist in zip(ids, distances):
all_candidates.append((chunk_id, dist, index_path))
except Exception as exc:
self.logger.debug(
"Binary search failed for %s: %s", index_path, exc
)
stats.errors.append(f"Binary search failed for {index_path}: {exc}")
if not all_candidates:
self.logger.debug("No binary candidates found, falling back to hybrid")
@@ -743,6 +772,10 @@ class ChainSearchEngine:
def _get_or_create_binary_index(self, index_path: Path) -> Optional[Any]:
"""Get or create a BinaryANNIndex for the given index path.
.. deprecated::
This method uses the deprecated BinaryANNIndex. For centralized indexes,
use _get_centralized_binary_searcher() instead.
Attempts to load an existing binary index from disk. If not found,
returns None (binary index should be built during indexing).
@@ -753,16 +786,48 @@ class ChainSearchEngine:
BinaryANNIndex instance or None if not available
"""
try:
from codexlens.semantic.ann_index import BinaryANNIndex import warnings
# Suppress deprecation warning since we're using it intentionally for legacy support
with warnings.catch_warnings():
warnings.filterwarnings("ignore", category=DeprecationWarning)
from codexlens.semantic.ann_index import BinaryANNIndex
binary_index = BinaryANNIndex(index_path, dim=256)
if binary_index.load():
return binary_index
return None
except Exception as exc:
self.logger.debug("Failed to load binary index for %s: %s", index_path, exc)
return None
def _get_centralized_binary_searcher(self, index_root: Path) -> Optional[Any]:
"""Get centralized BinarySearcher for memory-mapped binary vectors.
This is the preferred method for centralized indexes, providing faster
search via memory-mapped files.
Args:
index_root: Root directory containing centralized index files
Returns:
BinarySearcher instance or None if not available
"""
try:
from codexlens.search.binary_searcher import BinarySearcher
binary_searcher = BinarySearcher(index_root)
if binary_searcher.load():
self.logger.debug(
"Using centralized BinarySearcher with %d vectors (mmap=%s)",
binary_searcher.vector_count,
binary_searcher.is_memmap
)
return binary_searcher
return None
except Exception as exc:
self.logger.debug("Failed to load centralized binary searcher: %s", exc)
return None
def _compute_cosine_similarity(
self,
query_vec: "np.ndarray",

View File

@@ -508,6 +508,10 @@ class ANNIndex:
class BinaryANNIndex:
"""Binary vector ANN index using Hamming distance for fast coarse retrieval.
.. deprecated::
This class is deprecated. Use :class:`codexlens.search.binary_searcher.BinarySearcher`
instead, which provides faster memory-mapped search with centralized storage.
Optimized for binary vectors (256-bit / 32 bytes per vector).
Uses packed binary representation for memory efficiency.
@@ -553,6 +557,14 @@ class BinaryANNIndex:
"Install with: pip install codexlens[semantic]" "Install with: pip install codexlens[semantic]"
) )
import warnings
warnings.warn(
"BinaryANNIndex is deprecated. Use codexlens.search.binary_searcher.BinarySearcher "
"instead for faster memory-mapped search with centralized storage.",
DeprecationWarning,
stacklevel=2
)
if dim <= 0 or dim % 8 != 0:
raise ValueError(
f"Invalid dimension: {dim}. Must be positive and divisible by 8."

View File

@@ -220,12 +220,16 @@ class SpladeEncoder:
from transformers import AutoTokenizer
if self.providers is None:
from .gpu_support import get_optimal_providers from .gpu_support import get_optimal_providers, get_selected_device_id
# Include device_id options for DirectML/CUDA selection when available # Get providers as pure string list (cache-friendly)
# NOTE: with_device_options=False to avoid tuple-based providers
# which break optimum's caching mechanism
self.providers = get_optimal_providers( self.providers = get_optimal_providers(
use_gpu=self.use_gpu, with_device_options=True use_gpu=self.use_gpu, with_device_options=False
) )
# Get device_id separately for provider_options
self._device_id = get_selected_device_id() if self.use_gpu else None
# Some Optimum versions accept `providers`, others accept a single `provider` # Some Optimum versions accept `providers`, others accept a single `provider`
# Prefer passing the full providers list, with a conservative fallback # Prefer passing the full providers list, with a conservative fallback
@@ -234,6 +238,15 @@ class SpladeEncoder:
        params = signature(ORTModelForMaskedLM.from_pretrained).parameters
        if "providers" in params:
            model_kwargs["providers"] = self.providers
            # Pass device_id via provider_options for GPU selection
            if "provider_options" in params and hasattr(self, '_device_id') and self._device_id is not None:
                # Build provider_options dict for each GPU provider
                provider_options = {}
                for p in self.providers:
                    if p in ("DmlExecutionProvider", "CUDAExecutionProvider", "ROCMExecutionProvider"):
                        provider_options[p] = {"device_id": self._device_id}
                if provider_options:
                    model_kwargs["provider_options"] = provider_options
        elif "provider" in params:
            provider_name = "CPUExecutionProvider"
            if self.providers:
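
For contrast, the two provider-specification shapes involved in the change above are sketched below. The concrete values are illustrative only; the tuple form is reconstructed from the NOTE in the hunk, not copied from the old code.

    # Old shape (tuple-based): device options embedded in the providers list.
    # Per the NOTE above, this form interferes with optimum's model caching.
    providers_with_options = [("DmlExecutionProvider", {"device_id": 0}), "CPUExecutionProvider"]

    # New shape: plain provider name strings, with device selection passed separately
    # through provider_options (keyed by provider name, as built in the hunk above).
    providers = ["DmlExecutionProvider", "CPUExecutionProvider"]
    provider_options = {"DmlExecutionProvider": {"device_id": 0}}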
@@ -369,6 +382,21 @@ class SpladeEncoder:
        return sparse_dict

    def warmup(self, text: str = "warmup query") -> None:
        """Warmup the encoder by running a dummy inference.

        First-time model inference includes initialization overhead.
        Call this method once before the first real search to avoid
        latency spikes.

        Args:
            text: Dummy text for warmup (default: "warmup query")
        """
        logger.info("Warming up SPLADE encoder...")
        # Trigger model loading and first inference
        _ = self.encode_text(text)
        logger.info("SPLADE encoder warmup complete")
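
A possible call pattern for the new hook at service startup is sketched below. The import path and constructor arguments are assumptions; warmup() is added in this diff and encode_text() already exists on the encoder.

    # Hypothetical startup flow; SpladeEncoder's import path and constructor
    # arguments are assumptions for illustration.
    encoder = SpladeEncoder(use_gpu=True)
    encoder.warmup()                     # absorbs model-load + first-inference latency up front
    sparse_query = encoder.encode_text("hamming distance cascade search")
    print(f"{len(sparse_query)} non-zero SPLADE terms")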
    def encode_text(self, text: str) -> Dict[int, float]:
        """Encode text to sparse vector {token_id: weight}.

View File

@@ -59,6 +59,8 @@ class SpladeIndex:
        conn.execute("PRAGMA foreign_keys=ON")
        # Limit mmap to 1GB to avoid OOM on smaller systems
        conn.execute("PRAGMA mmap_size=1073741824")
        # Increase cache size for better query performance (20MB = -20000 pages)
        conn.execute("PRAGMA cache_size=-20000")
        self._local.conn = conn
        return conn
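
Worth noting: SQLite interprets a negative cache_size as an amount in KiB (positive values are counted in pages), so -20000 requests roughly 20 MB of page cache. A standalone sketch of the connection tuning above:

    import sqlite3

    conn = sqlite3.connect("splade_index.db")        # path is illustrative
    conn.execute("PRAGMA foreign_keys=ON")
    conn.execute("PRAGMA mmap_size=1073741824")      # cap memory-mapped I/O at 1 GiB
    conn.execute("PRAGMA cache_size=-20000")         # negative => KiB, i.e. ~20 MB of page cache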
@@ -385,25 +387,29 @@ class SpladeIndex:
        self,
        query_sparse: Dict[int, float],
        limit: int = 50,
        min_score: float = 0.0,
        max_query_terms: int = 64
    ) -> List[Tuple[int, float]]:
        """Search for similar chunks using dot-product scoring.

        Implements efficient sparse dot-product via SQL JOIN:
            score(q, d) = sum(q[t] * d[t]) for all tokens t

        Args:
            query_sparse: Query sparse vector as {token_id: weight}.
            limit: Maximum number of results.
            min_score: Minimum score threshold.
            max_query_terms: Maximum query terms to use (default: 64).
                Pruning to top-K terms reduces search time with minimal impact on quality.
                Set to 0 or negative to disable pruning (use all terms).

        Returns:
            List of (chunk_id, score) tuples, ordered by score descending.
        """
        if not query_sparse:
            logger.warning("Empty query sparse vector")
            return []

        with self._lock:
            conn = self._get_connection()
            try:
@@ -414,10 +420,20 @@ class SpladeIndex:
                    for token_id, weight in query_sparse.items()
                    if weight > 0
                ]

                if not query_terms:
                    logger.warning("No non-zero query terms")
                    return []

                # Query pruning: keep only top-K terms by weight
                # max_query_terms <= 0 means no limit (use all terms)
                if max_query_terms > 0 and len(query_terms) > max_query_terms:
                    query_terms = sorted(query_terms, key=lambda x: x[1], reverse=True)[:max_query_terms]
                    logger.debug(
                        "Query pruned from %d to %d terms",
                        len(query_sparse),
                        len(query_terms)
                    )

                # Create CTE for query terms using parameterized VALUES
                # Build placeholders and params to prevent SQL injection
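
To make the CTE construction concrete, here is a sketch of how the parameterized VALUES clause and scoring JOIN might be assembled from the pruned query_terms. The posting table and column names are assumptions; only the parameterized-VALUES idea and the score formula come from the hunk above.

    # Sketch only: `splade_postings` and its columns are assumed names.
    placeholders = ", ".join(["(?, ?)"] * len(query_terms))
    params = [x for term in query_terms for x in term]          # flatten (token_id, weight) pairs

    sql = f"""
        WITH query_terms(token_id, weight) AS (VALUES {placeholders})
        SELECT p.chunk_id, SUM(q.weight * p.weight) AS score
        FROM splade_postings AS p
        JOIN query_terms AS q ON q.token_id = p.token_id
        GROUP BY p.chunk_id
        HAVING score >= ?
        ORDER BY score DESC
        LIMIT ?
    """
    rows = conn.execute(sql, params + [min_score, limit]).fetchall()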

View File

@@ -96,6 +96,13 @@ class VectorMetadataStore:
                'CREATE INDEX IF NOT EXISTS idx_chunk_category '
                'ON chunk_metadata(category)'
            )

            # Binary vectors table for cascade search
            conn.execute('''
                CREATE TABLE IF NOT EXISTS binary_vectors (
                    chunk_id INTEGER PRIMARY KEY,
                    vector BLOB NOT NULL
                )
            ''')

            conn.commit()
            logger.debug("VectorMetadataStore schema created/verified")
        except sqlite3.Error as e:
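
One plausible way the new table gets populated during embedding generation is sketched below. The sign binarization and the pre-opened `store` object are assumptions; only the add_binary_vectors() and get_binary_vector_count() methods shown further down come from this diff.

    import numpy as np

    # Assume `store` is an open VectorMetadataStore and `embeddings` is an (N, 256)
    # float array aligned with `chunk_ids`; the >0 sign binarization is illustrative.
    chunk_ids = [101, 102, 103]
    embeddings = np.random.default_rng(0).standard_normal((3, 256)).astype(np.float32)

    blobs = [np.packbits((vec > 0).astype(np.uint8)).tobytes() for vec in embeddings]   # 32 bytes each
    store.add_binary_vectors(chunk_ids, blobs)
    assert store.get_binary_vector_count() == len(chunk_ids)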
@@ -329,3 +336,80 @@ class VectorMetadataStore:
    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        """Context manager exit."""
        self.close()

    # ============= Binary Vector Methods for Cascade Search =============

    def add_binary_vectors(
        self, chunk_ids: List[int], binary_vectors: List[bytes]
    ) -> None:
        """Batch insert binary vectors for cascade search.

        Args:
            chunk_ids: List of chunk IDs.
            binary_vectors: List of packed binary vectors (as bytes).
        """
        if not chunk_ids or len(chunk_ids) != len(binary_vectors):
            return

        with self._lock:
            conn = self._get_connection()
            try:
                data = list(zip(chunk_ids, binary_vectors))
                conn.executemany(
                    "INSERT OR REPLACE INTO binary_vectors (chunk_id, vector) VALUES (?, ?)",
                    data
                )
                conn.commit()
                logger.debug("Added %d binary vectors", len(chunk_ids))
            except sqlite3.Error as e:
                raise StorageError(
                    f"Failed to add binary vectors: {e}",
                    db_path=str(self.db_path),
                    operation="add_binary_vectors"
                ) from e

    def get_all_binary_vectors(self) -> List[tuple]:
        """Get all binary vectors for cascade search.

        Returns:
            List of (chunk_id, vector_bytes) tuples.
        """
        conn = self._get_connection()
        try:
            rows = conn.execute(
                "SELECT chunk_id, vector FROM binary_vectors"
            ).fetchall()
            return [(row[0], row[1]) for row in rows]
        except sqlite3.Error as e:
            logger.error("Failed to get binary vectors: %s", e)
            return []

    def get_binary_vector_count(self) -> int:
        """Get total number of binary vectors.

        Returns:
            Binary vector count.
        """
        conn = self._get_connection()
        try:
            row = conn.execute(
                "SELECT COUNT(*) FROM binary_vectors"
            ).fetchone()
            return row[0] if row else 0
        except sqlite3.Error:
            return 0

    def clear_binary_vectors(self) -> None:
        """Clear all binary vectors."""
        with self._lock:
            conn = self._get_connection()
            try:
                conn.execute("DELETE FROM binary_vectors")
                conn.commit()
                logger.info("Cleared all binary vectors")
            except sqlite3.Error as e:
                raise StorageError(
                    f"Failed to clear binary vectors: {e}",
                    db_path=str(self.db_path),
                    operation="clear_binary_vectors"
                ) from e
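
Finally, a brute-force sketch of the coarse cascade stage these methods enable: load every packed vector from the store and rank by Hamming distance. The production path goes through BinarySearcher; this only illustrates the data flow over the methods above.

    import numpy as np

    def coarse_search(store, query_blob: bytes, top_k: int = 100) -> list:
        """Rank stored binary vectors by Hamming distance to the query (ascending)."""
        query_bits = np.frombuffer(query_blob, dtype=np.uint8)
        scored = []
        for chunk_id, blob in store.get_all_binary_vectors():
            xor = np.bitwise_xor(query_bits, np.frombuffer(blob, dtype=np.uint8))
            scored.append((int(np.unpackbits(xor).sum()), chunk_id))
        scored.sort()
        return [chunk_id for _, chunk_id in scored[:top_k]]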