From 740bd1b61e1568a530b1b7d87775bb0d5ee764f2 Mon Sep 17 00:00:00 2001
From: catlog22
Date: Sat, 3 Jan 2026 11:47:07 +0800
Subject: [PATCH] fix(codexlens): Fix constructor and path handling issues

1. GlobalSymbolIndex constructor: Add project_id parameter lookup
   - Get project_id from registry using source_root
   - Pass project_id to GlobalSymbolIndex constructor

2. Binary cascade search path handling:
   - Add VectorMetadataStore import for centralized search
   - Fix _build_results_from_candidates to handle centralized mode
   - Use VectorMetadataStore for metadata, source_index_db for embeddings
   - Properly distinguish between index_root and index_path

3. Dense reranking for centralized search:
   - Get chunk metadata from _vectors_meta.db
   - Group chunks by source_index_db
   - Retrieve dense embeddings from respective _index.db files
---
 .../src/codexlens/search/chain_search.py      | 139 +++++++++++++++---
 .../codexlens/watcher/incremental_indexer.py  |  28 ++--
 2 files changed, 135 insertions(+), 32 deletions(-)

diff --git a/codex-lens/src/codexlens/search/chain_search.py b/codex-lens/src/codexlens/search/chain_search.py
index 7bfb8abf..b131b582 100644
--- a/codex-lens/src/codexlens/search/chain_search.py
+++ b/codex-lens/src/codexlens/search/chain_search.py
@@ -30,6 +30,8 @@ from codexlens.storage.dir_index import DirIndexStore, SubdirLink
 from codexlens.storage.global_index import GlobalSymbolIndex
 from codexlens.storage.path_mapper import PathMapper
 from codexlens.storage.sqlite_store import SQLiteStore
+from codexlens.storage.vector_meta_store import VectorMetadataStore
+from codexlens.config import VECTORS_META_DB_NAME
 
 from codexlens.search.hybrid_search import HybridSearchEngine
 
@@ -49,6 +51,8 @@ class SearchOptions:
         enable_fuzzy: Enable fuzzy FTS in hybrid mode (default True)
         enable_vector: Enable vector semantic search (default False)
         pure_vector: If True, only use vector search without FTS fallback (default False)
+        enable_splade: Enable SPLADE sparse neural search (default False)
+        enable_cascade: Enable cascade (binary+dense) two-stage retrieval (default False)
         hybrid_weights: Custom RRF weights for hybrid search (optional)
         group_results: Enable grouping of similar results (default False)
         grouping_threshold: Score threshold for grouping similar results (default 0.01)
@@ -64,6 +68,8 @@ class SearchOptions:
     enable_fuzzy: bool = True
     enable_vector: bool = False
     pure_vector: bool = False
+    enable_splade: bool = False
+    enable_cascade: bool = False
     hybrid_weights: Optional[Dict[str, float]] = None
     group_results: bool = False
     grouping_threshold: float = 0.01
@@ -622,7 +628,8 @@ class ChainSearchEngine:
             )
         # Fall back to using Hamming distance as score
         return self._build_results_from_candidates(
-            coarse_candidates[:k], index_paths, stats, query, start_time
+            coarse_candidates[:k], index_paths, stats, query, start_time,
+            use_centralized=used_centralized
         )
 
         # Group candidates by index path for batch retrieval
@@ -634,30 +641,96 @@ class ChainSearchEngine:
 
         # Retrieve dense embeddings and compute cosine similarity
         scored_results: List[Tuple[float, SearchResult]] = []
+        import sqlite3
         for index_path, chunk_ids in candidates_by_index.items():
             try:
-                # Read directly from semantic_chunks table (where cascade-index stores data)
-                import sqlite3
-                conn = sqlite3.connect(str(index_path))
-                conn.row_factory = sqlite3.Row
+                # Collect valid rows and dense vectors for batch processing
+                valid_rows: List[Dict[str, Any]] = []
+                dense_vectors: List["np.ndarray"] = []
 
-                placeholders = ",".join("?" * len(chunk_ids))
-                rows = conn.execute(
-                    f"SELECT id, file_path, content, embedding_dense FROM semantic_chunks WHERE id IN ({placeholders})",
-                    chunk_ids
-                ).fetchall()
-                conn.close()
+                if used_centralized:
+                    # Centralized mode: index_path is actually index_root directory
+                    # Dense embeddings are in per-directory _index.db files
+                    # referenced by source_index_db in chunk_metadata
+                    meta_db_path = index_path / VECTORS_META_DB_NAME
+                    if not meta_db_path.exists():
+                        self.logger.debug(
+                            "VectorMetadataStore not found at %s, skipping dense reranking", meta_db_path
+                        )
+                        continue
 
-                # Batch processing: collect all valid embeddings first
-                valid_rows = []
-                dense_vectors = []
-                for row in rows:
-                    dense_bytes = row["embedding_dense"]
-                    if dense_bytes is not None:
-                        valid_rows.append(row)
-                        dense_vectors.append(np.frombuffer(dense_bytes, dtype=np.float32))
+                    # Get chunk metadata with source_index_db references
+                    meta_store = VectorMetadataStore(meta_db_path)
+                    chunks_meta = meta_store.get_chunks_by_ids(chunk_ids)
 
+                    # Group chunks by source_index_db
+                    chunks_by_source: Dict[str, List[Dict[str, Any]]] = {}
+                    for chunk in chunks_meta:
+                        source_db = chunk.get("source_index_db")
+                        if source_db:
+                            if source_db not in chunks_by_source:
+                                chunks_by_source[source_db] = []
+                            chunks_by_source[source_db].append(chunk)
+
+                    # Retrieve dense embeddings from each source_index_db
+                    for source_db, source_chunks in chunks_by_source.items():
+                        try:
+                            source_chunk_ids = [c["chunk_id"] for c in source_chunks]
+                            conn = sqlite3.connect(source_db)
+                            conn.row_factory = sqlite3.Row
+
+                            placeholders = ",".join("?" * len(source_chunk_ids))
+                            # Try semantic_chunks first (newer schema), fall back to chunks
+                            try:
+                                rows = conn.execute(
+                                    f"SELECT id, embedding_dense FROM semantic_chunks WHERE id IN ({placeholders})",
+                                    source_chunk_ids
+                                ).fetchall()
+                            except sqlite3.OperationalError:
+                                rows = conn.execute(
+                                    f"SELECT id, embedding_dense FROM chunks WHERE id IN ({placeholders})",
+                                    source_chunk_ids
+                                ).fetchall()
+                            conn.close()
+
+                            # Build dense vector lookup
+                            dense_lookup = {row["id"]: row["embedding_dense"] for row in rows}
+
+                            # Process chunks with their embeddings
+                            for chunk in source_chunks:
+                                chunk_id = chunk["chunk_id"]
+                                dense_bytes = dense_lookup.get(chunk_id)
+                                if dense_bytes is not None:
+                                    valid_rows.append({
+                                        "id": chunk_id,
+                                        "file_path": chunk["file_path"],
+                                        "content": chunk["content"],
+                                    })
+                                    dense_vectors.append(np.frombuffer(dense_bytes, dtype=np.float32))
+                        except Exception as exc:
+                            self.logger.debug(
+                                "Failed to get dense embeddings from %s: %s", source_db, exc
+                            )
+                else:
+                    # Per-directory mode: index_path is the _index.db file
+                    conn = sqlite3.connect(str(index_path))
+                    conn.row_factory = sqlite3.Row
+
+                    placeholders = ",".join("?" * len(chunk_ids))
+                    rows = conn.execute(
+                        f"SELECT id, file_path, content, embedding_dense FROM semantic_chunks WHERE id IN ({placeholders})",
+                        chunk_ids
+                    ).fetchall()
+                    conn.close()
+
+                    for row in rows:
+                        dense_bytes = row["embedding_dense"]
+                        if dense_bytes is not None:
+                            valid_rows.append(dict(row))
+                            dense_vectors.append(np.frombuffer(dense_bytes, dtype=np.float32))
+
+                # Skip if no dense embeddings found
                 if not dense_vectors:
                     continue
 
@@ -670,9 +743,9 @@ class ChainSearchEngine:
                 # Create search results
                 for i, row in enumerate(valid_rows):
                     score = float(scores[i])
-                    excerpt = (row["content"] or "")[:500]
+                    excerpt = (row.get("content") or "")[:500]
                     result = SearchResult(
-                        path=row["file_path"] or "",
+                        path=row.get("file_path") or "",
                         score=score,
                         excerpt=excerpt,
                     )
@@ -919,6 +992,7 @@ class ChainSearchEngine:
         stats: SearchStats,
         query: str,
         start_time: float,
+        use_centralized: bool = False,
    ) -> ChainSearchResult:
         """Build ChainSearchResult from binary candidates using Hamming distance scores.
 
@@ -930,6 +1004,8 @@ class ChainSearchEngine:
             stats: SearchStats to update
             query: Original query string
             start_time: Search start time for timing
+            use_centralized: If True, index_path is the index_root directory
+                and VectorMetadataStore should be used instead of SQLiteStore
 
         Returns:
             ChainSearchResult with results scored by Hamming distance
@@ -945,9 +1021,22 @@ class ChainSearchEngine:
 
         for index_path, chunk_tuples in candidates_by_index.items():
             try:
-                store = SQLiteStore(index_path)
                 chunk_ids = [c[0] for c in chunk_tuples]
-                chunks_data = store.get_chunks_by_ids(chunk_ids)
+
+                # Use VectorMetadataStore for centralized search, SQLiteStore for per-directory
+                if use_centralized:
+                    # index_path is actually index_root directory for centralized search
+                    meta_db_path = index_path / VECTORS_META_DB_NAME
+                    if not meta_db_path.exists():
+                        self.logger.debug(
+                            "VectorMetadataStore not found at %s, skipping", meta_db_path
+                        )
+                        continue
+                    meta_store = VectorMetadataStore(meta_db_path)
+                    chunks_data = meta_store.get_chunks_by_ids(chunk_ids)
+                else:
+                    store = SQLiteStore(index_path)
+                    chunks_data = store.get_chunks_by_ids(chunk_ids)
 
                 chunk_content: Dict[int, Dict[str, Any]] = {
                     c["id"]: c for c in chunks_data
@@ -1341,6 +1430,7 @@ class ChainSearchEngine:
                     options.enable_fuzzy,
                     options.enable_vector,
                     options.pure_vector,
+                    options.enable_splade,
                     options.hybrid_weights
                 ): idx_path
                 for idx_path in index_paths
@@ -1369,6 +1459,7 @@ class ChainSearchEngine:
                              enable_fuzzy: bool = True,
                              enable_vector: bool = False,
                              pure_vector: bool = False,
+                             enable_splade: bool = False,
                              hybrid_weights: Optional[Dict[str, float]] = None) -> List[SearchResult]:
         """Search a single index database.
 
@@ -1384,6 +1475,7 @@ class ChainSearchEngine: enable_fuzzy: Enable fuzzy FTS in hybrid mode enable_vector: Enable vector semantic search pure_vector: If True, only use vector search without FTS fallback + enable_splade: If True, force SPLADE sparse neural search hybrid_weights: Custom RRF weights for hybrid search Returns: @@ -1400,6 +1492,7 @@ class ChainSearchEngine: enable_fuzzy=enable_fuzzy, enable_vector=enable_vector, pure_vector=pure_vector, + enable_splade=enable_splade, ) else: # Single-FTS search (exact or fuzzy mode) diff --git a/codex-lens/src/codexlens/watcher/incremental_indexer.py b/codex-lens/src/codexlens/watcher/incremental_indexer.py index bb836034..9991c5fc 100644 --- a/codex-lens/src/codexlens/watcher/incremental_indexer.py +++ b/codex-lens/src/codexlens/watcher/incremental_indexer.py @@ -70,16 +70,27 @@ class IncrementalIndexer: self._dir_stores: dict[Path, DirIndexStore] = {} self._lock = __import__("threading").RLock() - def _get_global_index(self, index_root: Path) -> Optional[GlobalSymbolIndex]: - """Get or create global symbol index.""" + def _get_global_index(self, index_root: Path, source_root: Optional[Path] = None) -> Optional[GlobalSymbolIndex]: + """Get or create global symbol index. + + Args: + index_root: Root directory containing the global symbol index DB + source_root: Source directory root for looking up project_id from registry + """ if not self.config.global_symbol_index_enabled: return None - + if self._global_index is None: global_db_path = index_root / GlobalSymbolIndex.DEFAULT_DB_NAME if global_db_path.exists(): - self._global_index = GlobalSymbolIndex(global_db_path) - + # Get project_id from registry using source_root + project_id = 0 # Default fallback + if source_root: + project_info = self.registry.get_project(source_root) + if project_info: + project_id = project_info.id + self._global_index = GlobalSymbolIndex(global_db_path, project_id=project_id) + return self._global_index def _get_dir_store(self, dir_path: Path) -> Optional[DirIndexStore]: @@ -94,10 +105,9 @@ class IncrementalIndexer: return None # Get index root for global index - index_root = self.mapper.source_to_index_dir( - self.mapper.get_project_root(dir_path) or dir_path - ) - global_index = self._get_global_index(index_root) + source_root = self.mapper.get_project_root(dir_path) or dir_path + index_root = self.mapper.source_to_index_dir(source_root) + global_index = self._get_global_index(index_root, source_root=source_root) store = DirIndexStore( index_db,
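
A minimal, standalone sketch of the dense-reranking flow described in item 3 of the commit message, assuming the schema visible in the diff above (semantic_chunks.embedding_dense stored as float32 blobs; chunk metadata dicts carrying chunk_id, file_path, content, and source_index_db). The function name and dict shapes here are illustrative, not the codexlens API:

import sqlite3
from collections import defaultdict
from typing import Any, Dict, List, Sequence

import numpy as np


def rerank_by_dense(query_vec: np.ndarray,
                    chunks_meta: Sequence[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Group candidate chunks by their source _index.db, fetch dense embeddings,
    and score each chunk by cosine similarity against the query vector."""
    # Group chunks by the per-directory database that holds their embeddings
    by_source: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
    for chunk in chunks_meta:
        if chunk.get("source_index_db"):
            by_source[chunk["source_index_db"]].append(chunk)

    scored: List[Dict[str, Any]] = []
    q = query_vec / (np.linalg.norm(query_vec) + 1e-12)
    for source_db, chunks in by_source.items():
        ids = [c["chunk_id"] for c in chunks]
        placeholders = ",".join("?" * len(ids))
        conn = sqlite3.connect(source_db)
        try:
            rows = conn.execute(
                f"SELECT id, embedding_dense FROM semantic_chunks WHERE id IN ({placeholders})",
                ids,
            ).fetchall()
        finally:
            conn.close()
        dense = {row[0]: row[1] for row in rows}
        for chunk in chunks:
            blob = dense.get(chunk["chunk_id"])
            if blob is None:
                continue  # no dense embedding; cascade keeps the coarse Hamming score instead
            v = np.frombuffer(blob, dtype=np.float32)
            v = v / (np.linalg.norm(v) + 1e-12)
            scored.append({**chunk, "score": float(q @ v)})
    scored.sort(key=lambda c: c["score"], reverse=True)
    return scored

This mirrors the two-stage cascade in the patch: binary (Hamming) candidates are mapped back to their per-directory _index.db via the centralized _vectors_meta.db, then reranked with full-precision dense vectors.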