Add comprehensive tests for schema cleanup migration and search comparison

- Implement tests for migration 005 to verify that deprecated fields are removed from the database schema (a minimal illustrative sketch follows this list).
- Ensure that new databases are created with a clean schema.
- Validate that keywords are correctly extracted from the normalized file_keywords table.
- Test symbol insertion without deprecated fields and subdir operations without direct_files.
- Create a detailed search comparison test to evaluate vector search vs hybrid search performance.
- Add a script for reindexing projects to extract code relationships and verify GraphAnalyzer functionality.
- Include a test script to check TreeSitter parser availability and relationship extraction from sample files.
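
For reference, the schema-cleanup check behind the first two points usually reduces to inspecting the rebuilt table. The following is a minimal, hypothetical pytest sketch: the files table, its inline keywords column, and the rebuild SQL are illustrative stand-ins, and it only mimics the effect of migration 005 rather than invoking CodexLens's real migration runner.

    import sqlite3

    def test_migration_005_drops_deprecated_columns(tmp_path):
        """Sketch only: table and column names are illustrative, not the real schema."""
        conn = sqlite3.connect(tmp_path / "index.db")
        # Legacy layout with a deprecated inline keywords column (hypothetical name).
        conn.execute("CREATE TABLE files (id INTEGER PRIMARY KEY, path TEXT, keywords TEXT)")
        conn.execute("INSERT INTO files (path, keywords) VALUES ('a.py', 'alpha,beta')")
        # Mimic what a SQLite cleanup migration typically does: rebuild the table
        # without the deprecated column and keep keywords in a normalized table.
        conn.executescript(
            """
            CREATE TABLE files_new (id INTEGER PRIMARY KEY, path TEXT);
            INSERT INTO files_new (id, path) SELECT id, path FROM files;
            DROP TABLE files;
            ALTER TABLE files_new RENAME TO files;
            CREATE TABLE file_keywords (file_id INTEGER, keyword TEXT);
            """
        )
        columns = {row[1] for row in conn.execute("PRAGMA table_info(files)")}
        assert "keywords" not in columns
        conn.close()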
Author: catlog22
Date: 2025-12-16 19:27:05 +08:00
Commit: df23975a0b (parent: 3da0ef2adb)
61 changed files with 13,114 additions and 366 deletions


@@ -8,21 +8,64 @@ from . import SEMANTIC_AVAILABLE
 class Embedder:
-    """Generate embeddings for code chunks using fastembed (ONNX-based)."""
+    """Generate embeddings for code chunks using fastembed (ONNX-based).
+
+    Supported Model Profiles:
+    - fast: BAAI/bge-small-en-v1.5 (384 dim) - Fast, lightweight, English-optimized
+    - code: jinaai/jina-embeddings-v2-base-code (768 dim) - Code-optimized, best for programming languages
+    - multilingual: intfloat/multilingual-e5-large (1024 dim) - Multilingual + code support
+    - balanced: mixedbread-ai/mxbai-embed-large-v1 (1024 dim) - High accuracy, general purpose
+    """
 
-    MODEL_NAME = "BAAI/bge-small-en-v1.5"
-    EMBEDDING_DIM = 384
+    # Model profiles for different use cases
+    MODELS = {
+        "fast": "BAAI/bge-small-en-v1.5",  # 384 dim - Fast, lightweight
+        "code": "jinaai/jina-embeddings-v2-base-code",  # 768 dim - Code-optimized
+        "multilingual": "intfloat/multilingual-e5-large",  # 1024 dim - Multilingual
+        "balanced": "mixedbread-ai/mxbai-embed-large-v1",  # 1024 dim - High accuracy
+    }
+
+    # Dimension mapping for each model
+    MODEL_DIMS = {
+        "BAAI/bge-small-en-v1.5": 384,
+        "jinaai/jina-embeddings-v2-base-code": 768,
+        "intfloat/multilingual-e5-large": 1024,
+        "mixedbread-ai/mxbai-embed-large-v1": 1024,
+    }
+
+    # Default model (fast profile)
+    DEFAULT_MODEL = "BAAI/bge-small-en-v1.5"
+    DEFAULT_PROFILE = "fast"
 
-    def __init__(self, model_name: str | None = None) -> None:
+    def __init__(self, model_name: str | None = None, profile: str | None = None) -> None:
+        """Initialize embedder with model or profile.
+
+        Args:
+            model_name: Explicit model name (e.g., "jinaai/jina-embeddings-v2-base-code")
+            profile: Model profile shortcut ("fast", "code", "multilingual", "balanced")
+                If both provided, model_name takes precedence.
+        """
         if not SEMANTIC_AVAILABLE:
             raise ImportError(
                 "Semantic search dependencies not available. "
                 "Install with: pip install codexlens[semantic]"
             )
-        self.model_name = model_name or self.MODEL_NAME
+
+        # Resolve model name from profile or use explicit name
+        if model_name:
+            self.model_name = model_name
+        elif profile and profile in self.MODELS:
+            self.model_name = self.MODELS[profile]
+        else:
+            self.model_name = self.DEFAULT_MODEL
+
         self._model = None
 
+    @property
+    def embedding_dim(self) -> int:
+        """Get embedding dimension for current model."""
+        return self.MODEL_DIMS.get(self.model_name, 768)  # Default to 768 if unknown
+
     def _load_model(self) -> None:
         """Lazy load the embedding model."""
         if self._model is not None:
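
For reviewers, the new constructor surface in the hunk above can be exercised roughly as follows. This is a usage sketch based only on the code shown: it assumes the semantic extra is installed (otherwise __init__ raises ImportError), and the import path codexlens.semantic.embedder is a guess rather than something the diff confirms.

    from codexlens.semantic.embedder import Embedder  # import path assumed, not shown in the diff

    # Profile shortcut resolves via MODELS: "code" -> jinaai/jina-embeddings-v2-base-code.
    embedder = Embedder(profile="code")
    assert embedder.embedding_dim == 768

    # An explicit model name takes precedence over the profile argument.
    embedder = Embedder(model_name="intfloat/multilingual-e5-large", profile="fast")
    assert embedder.model_name == "intfloat/multilingual-e5-large"
    assert embedder.embedding_dim == 1024

    # With no arguments the default (fast) profile is used: BAAI/bge-small-en-v1.5, 384 dims.
    assert Embedder().embedding_dim == 384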