feat: Enhance embedding generation and search capabilities

- Added pre-calculation of estimated chunk count for HNSW capacity in `generate_dense_embeddings_centralized` to optimize indexing performance.
- Implemented binary vector generation with memory-mapped storage for efficient cascade search, including metadata saving.
- Introduced SPLADE sparse index generation with improved handling and metadata storage.
- Updated `ChainSearchEngine` to prefer centralized binary searcher for improved performance and added fallback to legacy binary index.
- Deprecated `BinaryANNIndex` in favor of `BinarySearcher` for better memory management and performance.
- Enhanced `SpladeEncoder` with warmup functionality to reduce latency spikes during first-time inference (see the warmup sketch after this list).
- Improved `SpladeIndex` with cache size adjustments for better query performance.
- Added methods for managing binary vectors in `VectorMetadataStore`, including batch insertion and retrieval.
- Created a new `BinarySearcher` class for efficient binary vector search using Hamming distance, supporting both memory-mapped and database loading modes (a sketch of the quantize-and-search idea follows this list).
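
As context for the `BinarySearcher` and memory-mapped storage items above, here is a minimal sketch of the underlying technique: binarize dense embeddings by sign, pack them 8 bits per byte, and rank candidates by Hamming distance. The 256-bit width matches the `dim=256` seen in the diff below; everything else (helper names, the file name, the memmap layout) is an illustrative assumption, not the actual `BinarySearcher` implementation.

```python
import numpy as np

DIM = 256  # bit width, matching dim=256 in the diff below

def quantize_to_binary(dense: np.ndarray) -> np.ndarray:
    """Binarize a float vector by sign, then pack 8 bits per byte."""
    bits = (dense > 0).astype(np.uint8)
    return np.packbits(bits)  # (DIM,) floats -> (DIM // 8,) uint8

def hamming_search(packed: np.ndarray, query_dense: np.ndarray, top_k: int):
    """Rank packed binary vectors by Hamming distance to a dense query."""
    query_packed = quantize_to_binary(query_dense)
    # XOR marks differing bits; unpackbits + sum is a simple popcount.
    diff_bits = np.unpackbits(np.bitwise_xor(packed, query_packed), axis=1)
    distances = diff_bits.sum(axis=1)
    order = np.argsort(distances)[:top_k]
    return [(int(i), int(distances[i])) for i in order]

# The packed matrix can be memory-mapped so the OS pages it in lazily,
# e.g. (hypothetical file name):
# packed = np.memmap("binary_vectors.bin", dtype=np.uint8, mode="r").reshape(-1, DIM // 8)
rng = np.random.default_rng(0)
packed = np.vstack([quantize_to_binary(v) for v in rng.standard_normal((1000, DIM))])
print(hamming_search(packed, rng.standard_normal(DIM), top_k=5))
```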
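And for the `SpladeEncoder` warmup item: the usual trick is to run one throwaway inference at load time so lazy model initialization and caches are paid for before the first real query. A minimal sketch under that assumption; the class shape and method names here are hypothetical, not the actual `SpladeEncoder` API.

```python
from typing import Any, Callable, List

class WarmupEncoder:
    """Hypothetical wrapper showing the warmup pattern, not the real SpladeEncoder."""

    def __init__(self, encode_fn: Callable[[List[str]], Any]):
        self._encode = encode_fn

    def warmup(self) -> None:
        # One throwaway call forces lazy model loading and cache population,
        # so the first real query avoids the cold-start latency spike.
        self._encode(["warmup query"])

    def encode(self, texts: List[str]) -> Any:
        return self._encode(texts)

# Usage: wrap any encode function and warm it up before serving queries.
encoder = WarmupEncoder(lambda texts: [t.lower().split() for t in texts])
encoder.warmup()
print(encoder.encode(["Hamming distance search"]))
```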
catlog22
2026-01-02 23:57:55 +08:00
parent 96b44e1482
commit 54fd94547c
12 changed files with 945 additions and 167 deletions

@@ -541,26 +541,55 @@ class ChainSearchEngine:
             )
             return self.hybrid_cascade_search(query, source_path, k, coarse_k, options)
 
-        # Search all indexes for binary candidates
+        # Try centralized BinarySearcher first (preferred for mmap indexes)
+        # The index root is the parent of the first index path
+        index_root = index_paths[0].parent if index_paths else None
         all_candidates: List[Tuple[int, int, Path]] = []  # (chunk_id, distance, index_path)
+        used_centralized = False
-        for index_path in index_paths:
-            try:
-                # Get or create binary index for this path
-                binary_index = self._get_or_create_binary_index(index_path)
-                if binary_index is None or binary_index.count() == 0:
-                    continue
+        if index_root:
+            centralized_searcher = self._get_centralized_binary_searcher(index_root)
+            if centralized_searcher is not None:
+                try:
+                    # BinarySearcher expects dense vector, not packed binary
+                    from codexlens.semantic.embedder import Embedder
+                    embedder = Embedder()
+                    query_dense = embedder.embed_to_numpy([query])[0]
-                # Search binary index
-                ids, distances = binary_index.search(query_binary_packed, coarse_k)
-                for chunk_id, dist in zip(ids, distances):
-                    all_candidates.append((chunk_id, dist, index_path))
+                    # Centralized search - returns (chunk_id, hamming_distance) tuples
+                    results = centralized_searcher.search(query_dense, top_k=coarse_k)
+                    for chunk_id, dist in results:
+                        all_candidates.append((chunk_id, dist, index_root))
+                    used_centralized = True
+                    self.logger.debug(
+                        "Centralized binary search found %d candidates", len(results)
+                    )
+                except Exception as exc:
+                    self.logger.debug(
+                        "Centralized binary search failed: %s, falling back to per-directory",
+                        exc
+                    )
+                    centralized_searcher.clear()
-            except Exception as exc:
-                self.logger.debug(
-                    "Binary search failed for %s: %s", index_path, exc
-                )
-                stats.errors.append(f"Binary search failed for {index_path}: {exc}")
+
+        # Fallback: Search per-directory indexes with legacy BinaryANNIndex
+        if not used_centralized:
+            for index_path in index_paths:
+                try:
+                    # Get or create binary index for this path (uses deprecated BinaryANNIndex)
+                    binary_index = self._get_or_create_binary_index(index_path)
+                    if binary_index is None or binary_index.count() == 0:
+                        continue
+                    # Search binary index
+                    ids, distances = binary_index.search(query_binary_packed, coarse_k)
+                    for chunk_id, dist in zip(ids, distances):
+                        all_candidates.append((chunk_id, dist, index_path))
+                except Exception as exc:
+                    self.logger.debug(
+                        "Binary search failed for %s: %s", index_path, exc
+                    )
+                    stats.errors.append(f"Binary search failed for {index_path}: {exc}")
 
         if not all_candidates:
             self.logger.debug("No binary candidates found, falling back to hybrid")
@@ -743,6 +772,10 @@ class ChainSearchEngine:
     def _get_or_create_binary_index(self, index_path: Path) -> Optional[Any]:
         """Get or create a BinaryANNIndex for the given index path.
 
+        .. deprecated::
+            This method uses the deprecated BinaryANNIndex. For centralized indexes,
+            use _get_centralized_binary_searcher() instead.
+
         Attempts to load an existing binary index from disk. If not found,
         returns None (binary index should be built during indexing).
@@ -753,16 +786,48 @@ class ChainSearchEngine:
             BinaryANNIndex instance or None if not available
         """
         try:
-            from codexlens.semantic.ann_index import BinaryANNIndex
+            import warnings
+            # Suppress deprecation warning since we're using it intentionally for legacy support
+            with warnings.catch_warnings():
+                warnings.filterwarnings("ignore", category=DeprecationWarning)
+                from codexlens.semantic.ann_index import BinaryANNIndex
+                binary_index = BinaryANNIndex(index_path, dim=256)
+                if binary_index.load():
+                    return binary_index
-            binary_index = BinaryANNIndex(index_path, dim=256)
-            if binary_index.load():
-                return binary_index
             return None
         except Exception as exc:
             self.logger.debug("Failed to load binary index for %s: %s", index_path, exc)
             return None
+
+    def _get_centralized_binary_searcher(self, index_root: Path) -> Optional[Any]:
+        """Get centralized BinarySearcher for memory-mapped binary vectors.
+
+        This is the preferred method for centralized indexes, providing faster
+        search via memory-mapped files.
+
+        Args:
+            index_root: Root directory containing centralized index files
+
+        Returns:
+            BinarySearcher instance or None if not available
+        """
+        try:
+            from codexlens.search.binary_searcher import BinarySearcher
+            binary_searcher = BinarySearcher(index_root)
+            if binary_searcher.load():
+                self.logger.debug(
+                    "Using centralized BinarySearcher with %d vectors (mmap=%s)",
+                    binary_searcher.vector_count,
+                    binary_searcher.is_memmap
+                )
+                return binary_searcher
+            return None
+        except Exception as exc:
+            self.logger.debug("Failed to load centralized binary searcher: %s", exc)
+            return None
 
     def _compute_cosine_similarity(
         self,
         query_vec: "np.ndarray",