Add comprehensive tests for schema cleanup migration and search comparison

- Implement tests for migration 005 to verify removal of deprecated fields from the database schema (see the sketch after this list).
- Ensure that new databases are created with a clean schema.
- Validate that keywords are correctly extracted from the normalized file_keywords table.
- Test symbol insertion without deprecated fields and subdir operations without direct_files.
- Create a detailed search comparison test to evaluate vector search vs. hybrid search performance.
- Add a script for reindexing projects to extract code relationships and verify GraphAnalyzer functionality.
- Include a test script to check TreeSitter parser availability and relationship extraction from sample files.
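
A minimal sketch of the migration check described above (hedged: `run_migrations` and the deprecated "keywords" column are illustrative assumptions, not the shipped test code):

    # Sketch only: run_migrations and the "keywords" column name are assumptions.
    import sqlite3

    def test_migration_005_drops_deprecated_fields(tmp_path):
        db_path = tmp_path / "_index.db"
        run_migrations(db_path, target=5)  # hypothetical migration runner
        with sqlite3.connect(db_path) as conn:
            cols = {row[1] for row in conn.execute("PRAGMA table_info(files)")}
        assert "keywords" not in cols  # deprecated field should be gone after 005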
catlog22
2025-12-16 19:27:05 +08:00
parent 3da0ef2adb
commit df23975a0b
61 changed files with 13114 additions and 366 deletions


@@ -0,0 +1,331 @@
"""Embedding Manager - Manage semantic embeddings for code indexes."""
import logging
import sqlite3
import time
from pathlib import Path
from typing import Dict, List, Optional
try:
from codexlens.semantic import SEMANTIC_AVAILABLE
if SEMANTIC_AVAILABLE:
from codexlens.semantic.embedder import Embedder
from codexlens.semantic.vector_store import VectorStore
from codexlens.semantic.chunker import Chunker, ChunkConfig
except ImportError:
    SEMANTIC_AVAILABLE = False

logger = logging.getLogger(__name__)


def check_index_embeddings(index_path: Path) -> Dict[str, Any]:
"""Check if an index has embeddings and return statistics.
Args:
index_path: Path to _index.db file
Returns:
Dictionary with embedding statistics and status
"""
if not index_path.exists():
return {
"success": False,
"error": f"Index not found: {index_path}",
}
try:
with sqlite3.connect(index_path) as conn:
# Check if semantic_chunks table exists
cursor = conn.execute(
"SELECT name FROM sqlite_master WHERE type='table' AND name='semantic_chunks'"
)
table_exists = cursor.fetchone() is not None
if not table_exists:
# Count total indexed files even without embeddings
cursor = conn.execute("SELECT COUNT(*) FROM files")
total_files = cursor.fetchone()[0]
return {
"success": True,
"result": {
"has_embeddings": False,
"total_chunks": 0,
"total_files": total_files,
"files_with_chunks": 0,
"files_without_chunks": total_files,
"coverage_percent": 0.0,
"missing_files_sample": [],
"index_path": str(index_path),
},
}
# Count total chunks
cursor = conn.execute("SELECT COUNT(*) FROM semantic_chunks")
total_chunks = cursor.fetchone()[0]
# Count total indexed files
cursor = conn.execute("SELECT COUNT(*) FROM files")
total_files = cursor.fetchone()[0]
# Count files with embeddings
cursor = conn.execute(
"SELECT COUNT(DISTINCT file_path) FROM semantic_chunks"
)
files_with_chunks = cursor.fetchone()[0]
# Get a sample of files without embeddings
cursor = conn.execute("""
SELECT full_path
FROM files
WHERE full_path NOT IN (
SELECT DISTINCT file_path FROM semantic_chunks
)
LIMIT 5
""")
missing_files = [row[0] for row in cursor.fetchall()]
return {
"success": True,
"result": {
"has_embeddings": total_chunks > 0,
"total_chunks": total_chunks,
"total_files": total_files,
"files_with_chunks": files_with_chunks,
"files_without_chunks": total_files - files_with_chunks,
"coverage_percent": round((files_with_chunks / total_files * 100) if total_files > 0 else 0, 1),
"missing_files_sample": missing_files,
"index_path": str(index_path),
},
}
except Exception as e:
return {
"success": False,
"error": f"Failed to check embeddings: {str(e)}",
}
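
# A usage sketch (illustrative; the index path below is hypothetical):
#
#     stats = check_index_embeddings(Path.home() / ".codexlens" / "myproject" / "_index.db")
#     if stats["success"]:
#         print(f"coverage: {stats['result']['coverage_percent']}%")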


def generate_embeddings(
    index_path: Path,
    model_profile: str = "code",
    force: bool = False,
    chunk_size: int = 2000,
    progress_callback: Optional[Callable[[str], None]] = None,
) -> Dict[str, Any]:
"""Generate embeddings for an index.
Args:
index_path: Path to _index.db file
model_profile: Model profile (fast, code, multilingual, balanced)
force: If True, regenerate even if embeddings exist
chunk_size: Maximum chunk size in characters
progress_callback: Optional callback for progress updates
Returns:
Result dictionary with generation statistics
"""
if not SEMANTIC_AVAILABLE:
return {
"success": False,
"error": "Semantic search not available. Install with: pip install codexlens[semantic]",
}
if not index_path.exists():
return {
"success": False,
"error": f"Index not found: {index_path}",
}
# Check existing chunks
status = check_index_embeddings(index_path)
if not status["success"]:
return status
existing_chunks = status["result"]["total_chunks"]
if existing_chunks > 0 and not force:
return {
"success": False,
"error": f"Index already has {existing_chunks} chunks. Use --force to regenerate.",
"existing_chunks": existing_chunks,
}
if force and existing_chunks > 0:
if progress_callback:
progress_callback(f"Clearing {existing_chunks} existing chunks...")
try:
with sqlite3.connect(index_path) as conn:
conn.execute("DELETE FROM semantic_chunks")
conn.commit()
except Exception as e:
return {
"success": False,
"error": f"Failed to clear existing chunks: {str(e)}",
}
# Initialize components
try:
embedder = Embedder(profile=model_profile)
vector_store = VectorStore(index_path)
chunker = Chunker(config=ChunkConfig(max_chunk_size=chunk_size))
if progress_callback:
progress_callback(f"Using model: {embedder.model_name} ({embedder.embedding_dim} dimensions)")
except Exception as e:
return {
"success": False,
"error": f"Failed to initialize components: {str(e)}",
}
# Read files from index
try:
with sqlite3.connect(index_path) as conn:
conn.row_factory = sqlite3.Row
cursor = conn.execute("SELECT full_path, content, language FROM files")
files = cursor.fetchall()
except Exception as e:
return {
"success": False,
"error": f"Failed to read files: {str(e)}",
}
if len(files) == 0:
return {
"success": False,
"error": "No files found in index",
}
if progress_callback:
progress_callback(f"Processing {len(files)} files...")
# Process each file
total_chunks = 0
failed_files = []
start_time = time.time()
for idx, file_row in enumerate(files, 1):
file_path = file_row["full_path"]
content = file_row["content"]
language = file_row["language"] or "python"
try:
# Create chunks
chunks = chunker.chunk_sliding_window(
content,
file_path=file_path,
language=language
)
if not chunks:
continue
# Generate embeddings
for chunk in chunks:
embedding = embedder.embed_single(chunk.content)
chunk.embedding = embedding
# Store chunks
vector_store.add_chunks(chunks, file_path)
total_chunks += len(chunks)
if progress_callback:
progress_callback(f"[{idx}/{len(files)}] {file_path}: {len(chunks)} chunks")
except Exception as e:
logger.error(f"Failed to process {file_path}: {e}")
failed_files.append((file_path, str(e)))
elapsed_time = time.time() - start_time
return {
"success": True,
"result": {
"chunks_created": total_chunks,
"files_processed": len(files) - len(failed_files),
"files_failed": len(failed_files),
"elapsed_time": elapsed_time,
"model_profile": model_profile,
"model_name": embedder.model_name,
"failed_files": failed_files[:5], # First 5 failures
"index_path": str(index_path),
},
}
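
# A usage sketch (illustrative; passing print as the progress callback is just one option):
#
#     result = generate_embeddings(index_path, model_profile="code", progress_callback=print)
#     if not result["success"]:
#         print(result["error"])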


def find_all_indexes(scan_dir: Path) -> List[Path]:
    """Find all _index.db files in directory tree.

    Args:
        scan_dir: Directory to scan

    Returns:
        List of paths to _index.db files
    """
if not scan_dir.exists():
return []
return list(scan_dir.rglob("_index.db"))


def get_embedding_stats_summary(index_root: Path) -> Dict[str, Any]:
    """Get summary statistics for all indexes in root directory.

    Args:
        index_root: Root directory containing indexes

    Returns:
        Summary statistics for all indexes
    """
indexes = find_all_indexes(index_root)
if not indexes:
return {
"success": True,
"result": {
"total_indexes": 0,
"indexes_with_embeddings": 0,
"total_chunks": 0,
"indexes": [],
},
}
total_chunks = 0
indexes_with_embeddings = 0
index_stats = []
for index_path in indexes:
status = check_index_embeddings(index_path)
if status["success"]:
result = status["result"]
has_emb = result["has_embeddings"]
chunks = result["total_chunks"]
if has_emb:
indexes_with_embeddings += 1
total_chunks += chunks
# Extract project name from path
project_name = index_path.parent.name
index_stats.append({
"project": project_name,
"path": str(index_path),
"has_embeddings": has_emb,
"total_chunks": chunks,
"total_files": result["total_files"],
"coverage_percent": result.get("coverage_percent", 0),
})
return {
"success": True,
"result": {
"total_indexes": len(indexes),
"indexes_with_embeddings": indexes_with_embeddings,
"total_chunks": total_chunks,
"indexes": index_stats,
},
}
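
# An end-to-end sketch (illustrative; the root directory below is hypothetical):
#
#     summary = get_embedding_stats_summary(Path.home() / ".codexlens")
#     for entry in summary["result"]["indexes"]:
#         print(entry["project"], entry["total_chunks"], f"{entry['coverage_percent']}%")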