Add comprehensive tests for schema cleanup migration and search comparison

- Implement tests for migration 005 to verify that deprecated fields are removed from the database schema (a minimal illustrative sketch follows this list).
- Ensure that new databases are created with a clean schema.
- Validate that keywords are correctly extracted from the normalized file_keywords table.
- Test symbol insertion without deprecated fields and subdir operations without direct_files.
- Create a detailed search comparison test to evaluate vector search vs hybrid search performance.
- Add a script for reindexing projects to extract code relationships and verify GraphAnalyzer functionality.
- Include a test script to check TreeSitter parser availability and relationship extraction from sample files.
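
For reference, the schema-cleanup check behind the first two points usually reduces to inspecting the rebuilt table. The following is a minimal, hypothetical pytest sketch: the files table, its inline keywords column, and the rebuild SQL are illustrative stand-ins, and it only mimics the effect of migration 005 rather than invoking CodexLens's real migration runner.

    import sqlite3

    def test_migration_005_drops_deprecated_columns(tmp_path):
        """Sketch only: table and column names are illustrative, not the real schema."""
        conn = sqlite3.connect(tmp_path / "index.db")
        # Legacy layout with a deprecated inline keywords column (hypothetical name).
        conn.execute("CREATE TABLE files (id INTEGER PRIMARY KEY, path TEXT, keywords TEXT)")
        conn.execute("INSERT INTO files (path, keywords) VALUES ('a.py', 'alpha,beta')")
        # Mimic what a SQLite cleanup migration typically does: rebuild the table
        # without the deprecated column and keep keywords in a normalized table.
        conn.executescript(
            """
            CREATE TABLE files_new (id INTEGER PRIMARY KEY, path TEXT);
            INSERT INTO files_new (id, path) SELECT id, path FROM files;
            DROP TABLE files;
            ALTER TABLE files_new RENAME TO files;
            CREATE TABLE file_keywords (file_id INTEGER, keyword TEXT);
            """
        )
        columns = {row[1] for row in conn.execute("PRAGMA table_info(files)")}
        assert "keywords" not in columns
        conn.close()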
Author: catlog22
Date: 2025-12-16 19:27:05 +08:00
Commit: df23975a0b (parent: 3da0ef2adb)
61 changed files with 13,114 additions and 366 deletions


@@ -8,21 +8,64 @@ from . import SEMANTIC_AVAILABLE
 class Embedder:
-    """Generate embeddings for code chunks using fastembed (ONNX-based)."""
+    """Generate embeddings for code chunks using fastembed (ONNX-based).
+
+    Supported Model Profiles:
+    - fast: BAAI/bge-small-en-v1.5 (384 dim) - Fast, lightweight, English-optimized
+    - code: jinaai/jina-embeddings-v2-base-code (768 dim) - Code-optimized, best for programming languages
+    - multilingual: intfloat/multilingual-e5-large (1024 dim) - Multilingual + code support
+    - balanced: mixedbread-ai/mxbai-embed-large-v1 (1024 dim) - High accuracy, general purpose
+    """
 
-    MODEL_NAME = "BAAI/bge-small-en-v1.5"
-    EMBEDDING_DIM = 384
+    # Model profiles for different use cases
+    MODELS = {
+        "fast": "BAAI/bge-small-en-v1.5",  # 384 dim - Fast, lightweight
+        "code": "jinaai/jina-embeddings-v2-base-code",  # 768 dim - Code-optimized
+        "multilingual": "intfloat/multilingual-e5-large",  # 1024 dim - Multilingual
+        "balanced": "mixedbread-ai/mxbai-embed-large-v1",  # 1024 dim - High accuracy
+    }
+
+    # Dimension mapping for each model
+    MODEL_DIMS = {
+        "BAAI/bge-small-en-v1.5": 384,
+        "jinaai/jina-embeddings-v2-base-code": 768,
+        "intfloat/multilingual-e5-large": 1024,
+        "mixedbread-ai/mxbai-embed-large-v1": 1024,
+    }
+
+    # Default model (fast profile)
+    DEFAULT_MODEL = "BAAI/bge-small-en-v1.5"
+    DEFAULT_PROFILE = "fast"
 
-    def __init__(self, model_name: str | None = None) -> None:
+    def __init__(self, model_name: str | None = None, profile: str | None = None) -> None:
+        """Initialize embedder with model or profile.
+
+        Args:
+            model_name: Explicit model name (e.g., "jinaai/jina-embeddings-v2-base-code")
+            profile: Model profile shortcut ("fast", "code", "multilingual", "balanced")
+                If both provided, model_name takes precedence.
+        """
         if not SEMANTIC_AVAILABLE:
             raise ImportError(
                 "Semantic search dependencies not available. "
                 "Install with: pip install codexlens[semantic]"
             )
-        self.model_name = model_name or self.MODEL_NAME
+
+        # Resolve model name from profile or use explicit name
+        if model_name:
+            self.model_name = model_name
+        elif profile and profile in self.MODELS:
+            self.model_name = self.MODELS[profile]
+        else:
+            self.model_name = self.DEFAULT_MODEL
+
         self._model = None
 
+    @property
+    def embedding_dim(self) -> int:
+        """Get embedding dimension for current model."""
+        return self.MODEL_DIMS.get(self.model_name, 768)  # Default to 768 if unknown
+
     def _load_model(self) -> None:
         """Lazy load the embedding model."""
         if self._model is not None:
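
For reviewers, the new constructor surface in the hunk above can be exercised roughly as follows. This is a usage sketch based only on the code shown: it assumes the semantic extra is installed (otherwise __init__ raises ImportError), and the import path codexlens.semantic.embedder is a guess rather than something the diff confirms.

    from codexlens.semantic.embedder import Embedder  # import path assumed, not shown in the diff

    # Profile shortcut resolves via MODELS: "code" -> jinaai/jina-embeddings-v2-base-code.
    embedder = Embedder(profile="code")
    assert embedder.embedding_dim == 768

    # An explicit model name takes precedence over the profile argument.
    embedder = Embedder(model_name="intfloat/multilingual-e5-large", profile="fast")
    assert embedder.model_name == "intfloat/multilingual-e5-large"
    assert embedder.embedding_dim == 1024

    # With no arguments the default (fast) profile is used: BAAI/bge-small-en-v1.5, 384 dims.
    assert Embedder().embedding_dim == 384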