feat: Add centralized vector storage and metadata management for embeddings

2026-03-21 19:08:17 +08:00 · 2026-01-02 17:18:23 +08:00
parent 9157c5c78b
commit 0b6e9db8e4
5 changed files with 534 additions and 11 deletions
--- a/codex-lens/src/codexlens/search/hybrid_search.py
+++ b/codex-lens/src/codexlens/search/hybrid_search.py
@@ -664,10 +664,15 @@ class HybridSearchEngine:
        scores: List[float],
        category: Optional[str] = None,
    ) -> List[SearchResult]:
-        """Fetch chunk metadata from all _index.db files for centralized search.
+        """Fetch chunk metadata from centralized _vectors_meta.db for fast lookup.
+
+        This method uses the centralized VectorMetadataStore for O(1) lookup
+        instead of traversing all _index.db files (O(n) where n = number of indexes).
+
+        Falls back to the legacy per-index lookup if centralized metadata is unavailable.

        Args:
-            index_root: Root directory containing _index.db files
+            index_root: Root directory containing _vectors_meta.db
            chunk_ids: List of chunk IDs from ANN search
            scores: Corresponding similarity scores
            category: Optional category filter
@@ -675,12 +680,123 @@ class HybridSearchEngine:
        Returns:
            List of SearchResult objects
        """
-        import sqlite3
-        import json
+        from codexlens.config import VECTORS_META_DB_NAME

        # Build score map
        score_map = {cid: score for cid, score in zip(chunk_ids, scores)}

+        # Try centralized metadata store first (fast path)
+        vectors_meta_path = index_root / VECTORS_META_DB_NAME
+        if vectors_meta_path.exists():
+            try:
+                return self._fetch_from_vector_meta_store(
+                    vectors_meta_path, chunk_ids, score_map, category
+                )
+            except Exception as e:
+                self.logger.debug(
+                    "Centralized metadata lookup failed, falling back: %s", e
+                )
+
+        # Fallback: traverse _index.db files (legacy path)
+        return self._fetch_chunks_by_ids_legacy(
+            index_root, chunk_ids, score_map, category
+        )
+
+    def _fetch_from_vector_meta_store(
+        self,
+        meta_db_path: Path,
+        chunk_ids: List[int],
+        score_map: Dict[int, float],
+        category: Optional[str] = None,
+    ) -> List[SearchResult]:
+        """Fetch chunks from centralized VectorMetadataStore.
+
+        Args:
+            meta_db_path: Path to _vectors_meta.db
+            chunk_ids: List of chunk IDs to fetch
+            score_map: Mapping of chunk_id to score
+            category: Optional category filter
+
+        Returns:
+            List of SearchResult objects
+        """
+        from codexlens.storage.vector_meta_store import VectorMetadataStore
+
+        results = []
+
+        with VectorMetadataStore(meta_db_path) as meta_store:
+            rows = meta_store.get_chunks_by_ids(chunk_ids, category=category)
+
+            for row in rows:
+                chunk_id = row["chunk_id"]
+                file_path = row["file_path"]
+                content = row["content"] or ""
+                metadata = row.get("metadata") or {}
+                start_line = row.get("start_line")
+                end_line = row.get("end_line")
+
+                score = score_map.get(chunk_id, 0.0)
+
+                # Build excerpt
+                excerpt = content[:200] + "..." if len(content) > 200 else content
+
+                # Extract symbol information
+                symbol_name = metadata.get("symbol_name")
+                symbol_kind = metadata.get("symbol_kind")
+
+                # Build Symbol object if available
+                symbol = None
+                if symbol_name and symbol_kind and start_line and end_line:
+                    try:
+                        from codexlens.entities import Symbol
+                        symbol = Symbol(
+                            name=symbol_name,
+                            kind=symbol_kind,
+                            range=(start_line, end_line)
+                        )
+                    except Exception:
+                        pass
+
+                results.append(SearchResult(
+                    path=file_path,
+                    score=score,
+                    excerpt=excerpt,
+                    content=content,
+                    symbol=symbol,
+                    metadata=metadata,
+                    start_line=start_line,
+                    end_line=end_line,
+                    symbol_name=symbol_name,
+                    symbol_kind=symbol_kind,
+                ))
+
+        # Sort by score descending
+        results.sort(key=lambda r: r.score, reverse=True)
+        return results
+
+    def _fetch_chunks_by_ids_legacy(
+        self,
+        index_root: Path,
+        chunk_ids: List[int],
+        score_map: Dict[int, float],
+        category: Optional[str] = None,
+    ) -> List[SearchResult]:
+        """Legacy fallback: fetch chunk metadata by traversing all _index.db files.
+
+        This is the O(n) fallback path used when centralized metadata is unavailable.
+
+        Args:
+            index_root: Root directory containing _index.db files
+            chunk_ids: List of chunk IDs from ANN search
+            score_map: Mapping of chunk_id to score
+            category: Optional category filter
+
+        Returns:
+            List of SearchResult objects
+        """
+        import sqlite3
+        import json
+
        # Find all _index.db files
        index_files = list(index_root.rglob("_index.db"))