mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-02-10 02:24:35 +08:00
feat: Enhance embedding generation and search capabilities
- Added pre-calculation of estimated chunk count for HNSW capacity in `generate_dense_embeddings_centralized` to optimize indexing performance.
- Implemented binary vector generation with memory-mapped storage for efficient cascade search, including metadata saving.
- Introduced SPLADE sparse index generation with improved handling and metadata storage.
- Updated `ChainSearchEngine` to prefer centralized binary searcher for improved performance and added fallback to legacy binary index.
- Deprecated `BinaryANNIndex` in favor of `BinarySearcher` for better memory management and performance.
- Enhanced `SpladeEncoder` with warmup functionality to reduce latency spikes during first-time inference.
- Improved `SpladeIndex` with cache size adjustments for better query performance.
- Added methods for managing binary vectors in `VectorMetadataStore`, including batch insertion and retrieval.
- Created a new `BinarySearcher` class for efficient binary vector search using Hamming distance, supporting both memory-mapped and database loading modes.
This commit is contained in:
@@ -15,6 +15,7 @@ Requires-Dist: tree-sitter-python>=0.25
|
||||
Requires-Dist: tree-sitter-javascript>=0.25
|
||||
Requires-Dist: tree-sitter-typescript>=0.23
|
||||
Requires-Dist: pathspec>=0.11
|
||||
Requires-Dist: watchdog>=3.0
|
||||
Provides-Extra: semantic
|
||||
Requires-Dist: numpy>=1.24; extra == "semantic"
|
||||
Requires-Dist: fastembed>=0.2; extra == "semantic"
|
||||
@@ -29,6 +30,26 @@ Requires-Dist: numpy>=1.24; extra == "semantic-directml"
|
||||
Requires-Dist: fastembed>=0.2; extra == "semantic-directml"
|
||||
Requires-Dist: hnswlib>=0.8.0; extra == "semantic-directml"
|
||||
Requires-Dist: onnxruntime-directml>=1.15.0; extra == "semantic-directml"
|
||||
Provides-Extra: reranker-onnx
|
||||
Requires-Dist: optimum>=1.16; extra == "reranker-onnx"
|
||||
Requires-Dist: onnxruntime>=1.15; extra == "reranker-onnx"
|
||||
Requires-Dist: transformers>=4.36; extra == "reranker-onnx"
|
||||
Provides-Extra: reranker-api
|
||||
Requires-Dist: httpx>=0.25; extra == "reranker-api"
|
||||
Provides-Extra: reranker-litellm
|
||||
Requires-Dist: ccw-litellm>=0.1; extra == "reranker-litellm"
|
||||
Provides-Extra: reranker-legacy
|
||||
Requires-Dist: sentence-transformers>=2.2; extra == "reranker-legacy"
|
||||
Provides-Extra: reranker
|
||||
Requires-Dist: optimum>=1.16; extra == "reranker"
|
||||
Requires-Dist: onnxruntime>=1.15; extra == "reranker"
|
||||
Requires-Dist: transformers>=4.36; extra == "reranker"
|
||||
Provides-Extra: splade
|
||||
Requires-Dist: transformers>=4.36; extra == "splade"
|
||||
Requires-Dist: optimum[onnxruntime]>=1.16; extra == "splade"
|
||||
Provides-Extra: splade-gpu
|
||||
Requires-Dist: transformers>=4.36; extra == "splade-gpu"
|
||||
Requires-Dist: optimum[onnxruntime-gpu]>=1.16; extra == "splade-gpu"
|
||||
Provides-Extra: encoding
|
||||
Requires-Dist: chardet>=5.0; extra == "encoding"
|
||||
Provides-Extra: full
|
||||
|
||||
@@ -8,6 +8,7 @@ src/codexlens/__init__.py
|
||||
src/codexlens/__main__.py
|
||||
src/codexlens/config.py
|
||||
src/codexlens/entities.py
|
||||
src/codexlens/env_config.py
|
||||
src/codexlens/errors.py
|
||||
src/codexlens/cli/__init__.py
|
||||
src/codexlens/cli/commands.py
|
||||
@@ -15,6 +16,7 @@ src/codexlens/cli/embedding_manager.py
|
||||
src/codexlens/cli/model_manager.py
|
||||
src/codexlens/cli/output.py
|
||||
src/codexlens/indexing/__init__.py
|
||||
src/codexlens/indexing/embedding.py
|
||||
src/codexlens/indexing/symbol_extractor.py
|
||||
src/codexlens/parsers/__init__.py
|
||||
src/codexlens/parsers/encoding.py
|
||||
@@ -24,6 +26,7 @@ src/codexlens/parsers/treesitter_parser.py
|
||||
src/codexlens/search/__init__.py
|
||||
src/codexlens/search/chain_search.py
|
||||
src/codexlens/search/enrichment.py
|
||||
src/codexlens/search/graph_expander.py
|
||||
src/codexlens/search/hybrid_search.py
|
||||
src/codexlens/search/query_parser.py
|
||||
src/codexlens/search/ranking.py
|
||||
@@ -37,28 +40,52 @@ src/codexlens/semantic/factory.py
|
||||
src/codexlens/semantic/gpu_support.py
|
||||
src/codexlens/semantic/litellm_embedder.py
|
||||
src/codexlens/semantic/rotational_embedder.py
|
||||
src/codexlens/semantic/splade_encoder.py
|
||||
src/codexlens/semantic/vector_store.py
|
||||
src/codexlens/semantic/reranker/__init__.py
|
||||
src/codexlens/semantic/reranker/api_reranker.py
|
||||
src/codexlens/semantic/reranker/base.py
|
||||
src/codexlens/semantic/reranker/factory.py
|
||||
src/codexlens/semantic/reranker/legacy.py
|
||||
src/codexlens/semantic/reranker/litellm_reranker.py
|
||||
src/codexlens/semantic/reranker/onnx_reranker.py
|
||||
src/codexlens/storage/__init__.py
|
||||
src/codexlens/storage/dir_index.py
|
||||
src/codexlens/storage/file_cache.py
|
||||
src/codexlens/storage/global_index.py
|
||||
src/codexlens/storage/index_tree.py
|
||||
src/codexlens/storage/merkle_tree.py
|
||||
src/codexlens/storage/migration_manager.py
|
||||
src/codexlens/storage/path_mapper.py
|
||||
src/codexlens/storage/registry.py
|
||||
src/codexlens/storage/splade_index.py
|
||||
src/codexlens/storage/sqlite_store.py
|
||||
src/codexlens/storage/sqlite_utils.py
|
||||
src/codexlens/storage/vector_meta_store.py
|
||||
src/codexlens/storage/migrations/__init__.py
|
||||
src/codexlens/storage/migrations/migration_001_normalize_keywords.py
|
||||
src/codexlens/storage/migrations/migration_002_add_token_metadata.py
|
||||
src/codexlens/storage/migrations/migration_004_dual_fts.py
|
||||
src/codexlens/storage/migrations/migration_005_cleanup_unused_fields.py
|
||||
src/codexlens/storage/migrations/migration_006_enhance_relationships.py
|
||||
src/codexlens/storage/migrations/migration_007_add_graph_neighbors.py
|
||||
src/codexlens/storage/migrations/migration_008_add_merkle_hashes.py
|
||||
src/codexlens/storage/migrations/migration_009_add_splade.py
|
||||
src/codexlens/storage/migrations/migration_010_add_multi_vector_chunks.py
|
||||
src/codexlens/watcher/__init__.py
|
||||
src/codexlens/watcher/events.py
|
||||
src/codexlens/watcher/file_watcher.py
|
||||
src/codexlens/watcher/incremental_indexer.py
|
||||
src/codexlens/watcher/manager.py
|
||||
tests/test_ann_index.py
|
||||
tests/test_api_reranker.py
|
||||
tests/test_chain_search.py
|
||||
tests/test_cli_hybrid_search.py
|
||||
tests/test_cli_output.py
|
||||
tests/test_code_extractor.py
|
||||
tests/test_config.py
|
||||
tests/test_dual_fts.py
|
||||
tests/test_embedder.py
|
||||
tests/test_embedding_backend_availability.py
|
||||
tests/test_encoding.py
|
||||
tests/test_enrichment.py
|
||||
@@ -67,15 +94,22 @@ tests/test_errors.py
|
||||
tests/test_file_cache.py
|
||||
tests/test_global_index.py
|
||||
tests/test_global_symbol_index.py
|
||||
tests/test_graph_expansion.py
|
||||
tests/test_hybrid_chunker.py
|
||||
tests/test_hybrid_search_e2e.py
|
||||
tests/test_hybrid_search_reranker_backend.py
|
||||
tests/test_incremental_indexing.py
|
||||
tests/test_litellm_reranker.py
|
||||
tests/test_merkle_detection.py
|
||||
tests/test_parser_integration.py
|
||||
tests/test_parsers.py
|
||||
tests/test_performance_optimizations.py
|
||||
tests/test_pure_vector_search.py
|
||||
tests/test_query_parser.py
|
||||
tests/test_recursive_splitting.py
|
||||
tests/test_registry.py
|
||||
tests/test_reranker_backends.py
|
||||
tests/test_reranker_factory.py
|
||||
tests/test_result_grouping.py
|
||||
tests/test_rrf_fusion.py
|
||||
tests/test_schema_cleanup_migration.py
|
||||
@@ -85,11 +119,14 @@ tests/test_search_full_coverage.py
|
||||
tests/test_search_performance.py
|
||||
tests/test_semantic.py
|
||||
tests/test_semantic_search.py
|
||||
tests/test_sqlite_store.py
|
||||
tests/test_storage.py
|
||||
tests/test_storage_concurrency.py
|
||||
tests/test_symbol_extractor.py
|
||||
tests/test_token_chunking.py
|
||||
tests/test_token_storage.py
|
||||
tests/test_tokenizer.py
|
||||
tests/test_tokenizer_performance.py
|
||||
tests/test_treesitter_parser.py
|
||||
tests/test_vector_search_full.py
|
||||
tests/test_vector_search_full.py
|
||||
tests/test_vector_store.py
|
||||
@@ -6,6 +6,7 @@ tree-sitter-python>=0.25
|
||||
tree-sitter-javascript>=0.25
|
||||
tree-sitter-typescript>=0.23
|
||||
pathspec>=0.11
|
||||
watchdog>=3.0
|
||||
|
||||
[encoding]
|
||||
chardet>=5.0
|
||||
@@ -13,6 +14,25 @@ chardet>=5.0
|
||||
[full]
|
||||
tiktoken>=0.5.0
|
||||
|
||||
[reranker]
|
||||
optimum>=1.16
|
||||
onnxruntime>=1.15
|
||||
transformers>=4.36
|
||||
|
||||
[reranker-api]
|
||||
httpx>=0.25
|
||||
|
||||
[reranker-legacy]
|
||||
sentence-transformers>=2.2
|
||||
|
||||
[reranker-litellm]
|
||||
ccw-litellm>=0.1
|
||||
|
||||
[reranker-onnx]
|
||||
optimum>=1.16
|
||||
onnxruntime>=1.15
|
||||
transformers>=4.36
|
||||
|
||||
[semantic]
|
||||
numpy>=1.24
|
||||
fastembed>=0.2
|
||||
@@ -29,3 +49,11 @@ numpy>=1.24
|
||||
fastembed>=0.2
|
||||
hnswlib>=0.8.0
|
||||
onnxruntime-gpu>=1.15.0
|
||||
|
||||
[splade]
|
||||
transformers>=4.36
|
||||
optimum[onnxruntime]>=1.16
|
||||
|
||||
[splade-gpu]
|
||||
transformers>=4.36
|
||||
optimum[onnxruntime-gpu]>=1.16
|
||||
|
||||
Reference in New Issue
Block a user