feat: Enhance embedding generation and search capabilities

- Added pre-calculation of the estimated chunk count in `generate_dense_embeddings_centralized` to size the HNSW capacity up front and optimize indexing performance.
- Implemented binary vector generation with memory-mapped storage for efficient cascade search, including metadata saving.
- Introduced SPLADE sparse index generation with improved handling and metadata storage.
- Updated `ChainSearchEngine` to prefer the centralized binary searcher for improved performance and to fall back to the legacy binary index.
- Deprecated `BinaryANNIndex` in favor of `BinarySearcher` for better memory management and performance.
- Enhanced `SpladeEncoder` with warmup functionality to reduce latency spikes during first-time inference.
- Improved `SpladeIndex` with cache size adjustments for better query performance.
- Added methods for managing binary vectors in `VectorMetadataStore`, including batch insertion and retrieval.
- Created a new `BinarySearcher` class for efficient binary vector search using Hamming distance, supporting both memory-mapped and database loading modes.
This commit is contained in:
catlog22
2026-01-02 23:57:55 +08:00
parent 96b44e1482
commit 54fd94547c
12 changed files with 945 additions and 167 deletions

View File

@@ -15,6 +15,7 @@ Requires-Dist: tree-sitter-python>=0.25
Requires-Dist: tree-sitter-javascript>=0.25
Requires-Dist: tree-sitter-typescript>=0.23
Requires-Dist: pathspec>=0.11
Requires-Dist: watchdog>=3.0
Provides-Extra: semantic
Requires-Dist: numpy>=1.24; extra == "semantic"
Requires-Dist: fastembed>=0.2; extra == "semantic"
@@ -29,6 +30,26 @@ Requires-Dist: numpy>=1.24; extra == "semantic-directml"
Requires-Dist: fastembed>=0.2; extra == "semantic-directml"
Requires-Dist: hnswlib>=0.8.0; extra == "semantic-directml"
Requires-Dist: onnxruntime-directml>=1.15.0; extra == "semantic-directml"
Provides-Extra: reranker-onnx
Requires-Dist: optimum>=1.16; extra == "reranker-onnx"
Requires-Dist: onnxruntime>=1.15; extra == "reranker-onnx"
Requires-Dist: transformers>=4.36; extra == "reranker-onnx"
Provides-Extra: reranker-api
Requires-Dist: httpx>=0.25; extra == "reranker-api"
Provides-Extra: reranker-litellm
Requires-Dist: ccw-litellm>=0.1; extra == "reranker-litellm"
Provides-Extra: reranker-legacy
Requires-Dist: sentence-transformers>=2.2; extra == "reranker-legacy"
Provides-Extra: reranker
Requires-Dist: optimum>=1.16; extra == "reranker"
Requires-Dist: onnxruntime>=1.15; extra == "reranker"
Requires-Dist: transformers>=4.36; extra == "reranker"
Provides-Extra: splade
Requires-Dist: transformers>=4.36; extra == "splade"
Requires-Dist: optimum[onnxruntime]>=1.16; extra == "splade"
Provides-Extra: splade-gpu
Requires-Dist: transformers>=4.36; extra == "splade-gpu"
Requires-Dist: optimum[onnxruntime-gpu]>=1.16; extra == "splade-gpu"
Provides-Extra: encoding
Requires-Dist: chardet>=5.0; extra == "encoding"
Provides-Extra: full

View File

@@ -8,6 +8,7 @@ src/codexlens/__init__.py
src/codexlens/__main__.py
src/codexlens/config.py
src/codexlens/entities.py
src/codexlens/env_config.py
src/codexlens/errors.py
src/codexlens/cli/__init__.py
src/codexlens/cli/commands.py
@@ -15,6 +16,7 @@ src/codexlens/cli/embedding_manager.py
src/codexlens/cli/model_manager.py
src/codexlens/cli/output.py
src/codexlens/indexing/__init__.py
src/codexlens/indexing/embedding.py
src/codexlens/indexing/symbol_extractor.py
src/codexlens/parsers/__init__.py
src/codexlens/parsers/encoding.py
@@ -24,6 +26,7 @@ src/codexlens/parsers/treesitter_parser.py
src/codexlens/search/__init__.py
src/codexlens/search/chain_search.py
src/codexlens/search/enrichment.py
src/codexlens/search/graph_expander.py
src/codexlens/search/hybrid_search.py
src/codexlens/search/query_parser.py
src/codexlens/search/ranking.py
@@ -37,28 +40,52 @@ src/codexlens/semantic/factory.py
src/codexlens/semantic/gpu_support.py
src/codexlens/semantic/litellm_embedder.py
src/codexlens/semantic/rotational_embedder.py
src/codexlens/semantic/splade_encoder.py
src/codexlens/semantic/vector_store.py
src/codexlens/semantic/reranker/__init__.py
src/codexlens/semantic/reranker/api_reranker.py
src/codexlens/semantic/reranker/base.py
src/codexlens/semantic/reranker/factory.py
src/codexlens/semantic/reranker/legacy.py
src/codexlens/semantic/reranker/litellm_reranker.py
src/codexlens/semantic/reranker/onnx_reranker.py
src/codexlens/storage/__init__.py
src/codexlens/storage/dir_index.py
src/codexlens/storage/file_cache.py
src/codexlens/storage/global_index.py
src/codexlens/storage/index_tree.py
src/codexlens/storage/merkle_tree.py
src/codexlens/storage/migration_manager.py
src/codexlens/storage/path_mapper.py
src/codexlens/storage/registry.py
src/codexlens/storage/splade_index.py
src/codexlens/storage/sqlite_store.py
src/codexlens/storage/sqlite_utils.py
src/codexlens/storage/vector_meta_store.py
src/codexlens/storage/migrations/__init__.py
src/codexlens/storage/migrations/migration_001_normalize_keywords.py
src/codexlens/storage/migrations/migration_002_add_token_metadata.py
src/codexlens/storage/migrations/migration_004_dual_fts.py
src/codexlens/storage/migrations/migration_005_cleanup_unused_fields.py
src/codexlens/storage/migrations/migration_006_enhance_relationships.py
src/codexlens/storage/migrations/migration_007_add_graph_neighbors.py
src/codexlens/storage/migrations/migration_008_add_merkle_hashes.py
src/codexlens/storage/migrations/migration_009_add_splade.py
src/codexlens/storage/migrations/migration_010_add_multi_vector_chunks.py
src/codexlens/watcher/__init__.py
src/codexlens/watcher/events.py
src/codexlens/watcher/file_watcher.py
src/codexlens/watcher/incremental_indexer.py
src/codexlens/watcher/manager.py
tests/test_ann_index.py
tests/test_api_reranker.py
tests/test_chain_search.py
tests/test_cli_hybrid_search.py
tests/test_cli_output.py
tests/test_code_extractor.py
tests/test_config.py
tests/test_dual_fts.py
tests/test_embedder.py
tests/test_embedding_backend_availability.py
tests/test_encoding.py
tests/test_enrichment.py
@@ -67,15 +94,22 @@ tests/test_errors.py
tests/test_file_cache.py
tests/test_global_index.py
tests/test_global_symbol_index.py
tests/test_graph_expansion.py
tests/test_hybrid_chunker.py
tests/test_hybrid_search_e2e.py
tests/test_hybrid_search_reranker_backend.py
tests/test_incremental_indexing.py
tests/test_litellm_reranker.py
tests/test_merkle_detection.py
tests/test_parser_integration.py
tests/test_parsers.py
tests/test_performance_optimizations.py
tests/test_pure_vector_search.py
tests/test_query_parser.py
tests/test_recursive_splitting.py
tests/test_registry.py
tests/test_reranker_backends.py
tests/test_reranker_factory.py
tests/test_result_grouping.py
tests/test_rrf_fusion.py
tests/test_schema_cleanup_migration.py
@@ -85,11 +119,14 @@ tests/test_search_full_coverage.py
tests/test_search_performance.py
tests/test_semantic.py
tests/test_semantic_search.py
tests/test_sqlite_store.py
tests/test_storage.py
tests/test_storage_concurrency.py
tests/test_symbol_extractor.py
tests/test_token_chunking.py
tests/test_token_storage.py
tests/test_tokenizer.py
tests/test_tokenizer_performance.py
tests/test_treesitter_parser.py
tests/test_vector_search_full.py
tests/test_vector_search_full.py
tests/test_vector_store.py

View File

@@ -6,6 +6,7 @@ tree-sitter-python>=0.25
tree-sitter-javascript>=0.25
tree-sitter-typescript>=0.23
pathspec>=0.11
watchdog>=3.0
[encoding]
chardet>=5.0
@@ -13,6 +14,25 @@ chardet>=5.0
[full]
tiktoken>=0.5.0
[reranker]
optimum>=1.16
onnxruntime>=1.15
transformers>=4.36
[reranker-api]
httpx>=0.25
[reranker-legacy]
sentence-transformers>=2.2
[reranker-litellm]
ccw-litellm>=0.1
[reranker-onnx]
optimum>=1.16
onnxruntime>=1.15
transformers>=4.36
[semantic]
numpy>=1.24
fastembed>=0.2
@@ -29,3 +49,11 @@ numpy>=1.24
fastembed>=0.2
hnswlib>=0.8.0
onnxruntime-gpu>=1.15.0
[splade]
transformers>=4.36
optimum[onnxruntime]>=1.16
[splade-gpu]
transformers>=4.36
optimum[onnxruntime-gpu]>=1.16