Add comprehensive tests for schema cleanup migration and search comparison

- Implement tests for migration 005 to verify removal of deprecated fields in the database schema.
- Ensure that new databases are created with a clean schema.
- Validate that keywords are correctly extracted from the normalized file_keywords table.
- Test symbol insertion without deprecated fields and subdir operations without direct_files.
- Create a detailed search comparison test to evaluate vector search vs hybrid search performance.
- Add a script for reindexing projects to extract code relationships and verify GraphAnalyzer functionality.
- Include a test script to check TreeSitter parser availability and relationship extraction from sample files.
2026-02-14 02:42:04 +08:00 · 2025-12-16 19:27:05 +08:00
parent 3da0ef2adb
commit df23975a0b
61 changed files with 13114 additions and 366 deletions
--- a/codex-lens/src/codexlens/search/hybrid_search.py
+++ b/codex-lens/src/codexlens/search/hybrid_search.py
@@ -50,35 +50,68 @@ class HybridSearchEngine:
        limit: int = 20,
        enable_fuzzy: bool = True,
        enable_vector: bool = False,
+        pure_vector: bool = False,
    ) -> List[SearchResult]:
        """Execute hybrid search with parallel retrieval and RRF fusion.

        Args:
            index_path: Path to _index.db file
-            query: FTS5 query string
+            query: FTS5 query string (for FTS) or natural language query (for vector)
            limit: Maximum results to return after fusion
            enable_fuzzy: Enable fuzzy FTS search (default True)
            enable_vector: Enable vector search (default False)
+            pure_vector: If True, only use vector search without FTS fallback (default False)

        Returns:
            List of SearchResult objects sorted by fusion score

        Examples:
            >>> engine = HybridSearchEngine()
-            >>> results = engine.search(Path("project/_index.db"), "authentication")
+            >>> # Hybrid search (exact + fuzzy + vector)
+            >>> results = engine.search(Path("project/_index.db"), "authentication",
+            ...                         enable_vector=True)
+            >>> # Pure vector search (semantic only)
+            >>> results = engine.search(Path("project/_index.db"),
+            ...                         "how to authenticate users",
+            ...                         enable_vector=True, pure_vector=True)
            >>> for r in results[:5]:
            ...     print(f"{r.path}: {r.score:.3f}")
        """
        # Determine which backends to use
-        backends = {"exact": True}  # Always use exact search
-        if enable_fuzzy:
-            backends["fuzzy"] = True
-        if enable_vector:
-            backends["vector"] = True
+        backends = {}
+
+        if pure_vector:
+            # Pure vector mode: only use vector search, no FTS fallback
+            if enable_vector:
+                backends["vector"] = True
+            else:
+                # Invalid configuration: pure_vector=True but enable_vector=False
+                self.logger.warning(
+                    "pure_vector=True requires enable_vector=True. "
+                    "Falling back to exact search. "
+                    "To use pure vector search, enable vector search mode."
+                )
+                backends["exact"] = True
+        else:
+            # Hybrid mode: always include exact search as baseline
+            backends["exact"] = True
+            if enable_fuzzy:
+                backends["fuzzy"] = True
+            if enable_vector:
+                backends["vector"] = True

        # Execute parallel searches
        results_map = self._search_parallel(index_path, query, backends, limit)

+        # Provide helpful message if pure-vector mode returns no results
+        if pure_vector and enable_vector and len(results_map.get("vector", [])) == 0:
+            self.logger.warning(
+                "Pure vector search returned no results. "
+                "This usually means embeddings haven't been generated. "
+                "Run: codexlens embeddings-generate %s",
+                index_path.parent if index_path.name == "_index.db" else index_path
+            )
+
        # Apply RRF fusion
        # Filter weights to only active backends
        active_weights = {
@@ -195,17 +228,67 @@ class HybridSearchEngine:
    def _search_vector(
        self, index_path: Path, query: str, limit: int
    ) -> List[SearchResult]:
-        """Execute vector search (placeholder for future implementation).
+        """Execute vector similarity search using semantic embeddings.

        Args:
            index_path: Path to _index.db file
-            query: Query string
+            query: Natural language query string
            limit: Maximum results

        Returns:
-            List of SearchResult objects (empty for now)
+            List of SearchResult objects ordered by semantic similarity
        """
-        # Placeholder for vector search integration
-        # Will be implemented when VectorStore is available
-        self.logger.debug("Vector search not yet implemented")
-        return []
+        try:
+            # Check if semantic chunks table exists
+            import sqlite3
+            conn = sqlite3.connect(index_path)
+            cursor = conn.execute(
+                "SELECT name FROM sqlite_master WHERE type='table' AND name='semantic_chunks'"
+            )
+            has_semantic_table = cursor.fetchone() is not None
+            conn.close()
+
+            if not has_semantic_table:
+                self.logger.info(
+                    "No embeddings found in index. "
+                    "Generate embeddings with: codexlens embeddings-generate %s",
+                    index_path.parent if index_path.name == "_index.db" else index_path
+                )
+                return []
+
+            # Initialize embedder and vector store
+            from codexlens.semantic.embedder import Embedder
+            from codexlens.semantic.vector_store import VectorStore
+
+            embedder = Embedder(profile="code")  # Use code-optimized model
+            vector_store = VectorStore(index_path)
+
+            # Check if vector store has data
+            if vector_store.count_chunks() == 0:
+                self.logger.info(
+                    "Vector store is empty (0 chunks). "
+                    "Generate embeddings with: codexlens embeddings-generate %s",
+                    index_path.parent if index_path.name == "_index.db" else index_path
+                )
+                return []
+
+            # Generate query embedding
+            query_embedding = embedder.embed_single(query)
+
+            # Search for similar chunks
+            results = vector_store.search_similar(
+                query_embedding=query_embedding,
+                top_k=limit,
+                min_score=0.0,  # Return all results, let RRF handle filtering
+                return_full_content=True,
+            )
+
+            self.logger.debug("Vector search found %d results", len(results))
+            return results
+
+        except ImportError as exc:
+            self.logger.debug("Semantic dependencies not available: %s", exc)
+            return []
+        except Exception as exc:
+            self.logger.error("Vector search error: %s", exc)
+            return []