mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-02-10 02:24:35 +08:00
Add comprehensive tests for schema cleanup migration and search comparison
- Implement tests for migration 005 to verify removal of deprecated fields in the database schema.
- Ensure that new databases are created with a clean schema.
- Validate that keywords are correctly extracted from the normalized file_keywords table.
- Test symbol insertion without deprecated fields and subdir operations without direct_files.
- Create a detailed search comparison test to evaluate vector search vs hybrid search performance.
- Add a script for reindexing projects to extract code relationships and verify GraphAnalyzer functionality.
- Include a test script to check TreeSitter parser availability and relationship extraction from sample files.
This commit is contained in:
@@ -8,21 +8,64 @@ from . import SEMANTIC_AVAILABLE
|
||||
|
||||
|
||||
class Embedder:
|
||||
"""Generate embeddings for code chunks using fastembed (ONNX-based)."""
|
||||
"""Generate embeddings for code chunks using fastembed (ONNX-based).
|
||||
|
||||
MODEL_NAME = "BAAI/bge-small-en-v1.5"
|
||||
EMBEDDING_DIM = 384
|
||||
Supported Model Profiles:
|
||||
- fast: BAAI/bge-small-en-v1.5 (384 dim) - Fast, lightweight, English-optimized
|
||||
- code: jinaai/jina-embeddings-v2-base-code (768 dim) - Code-optimized, best for programming languages
|
||||
- multilingual: intfloat/multilingual-e5-large (1024 dim) - Multilingual + code support
|
||||
- balanced: mixedbread-ai/mxbai-embed-large-v1 (1024 dim) - High accuracy, general purpose
|
||||
"""
|
||||
|
||||
def __init__(self, model_name: str | None = None) -> None:
|
||||
# Model profiles for different use cases
|
||||
MODELS = {
|
||||
"fast": "BAAI/bge-small-en-v1.5", # 384 dim - Fast, lightweight
|
||||
"code": "jinaai/jina-embeddings-v2-base-code", # 768 dim - Code-optimized
|
||||
"multilingual": "intfloat/multilingual-e5-large", # 1024 dim - Multilingual
|
||||
"balanced": "mixedbread-ai/mxbai-embed-large-v1", # 1024 dim - High accuracy
|
||||
}
|
||||
|
||||
# Dimension mapping for each model
|
||||
MODEL_DIMS = {
|
||||
"BAAI/bge-small-en-v1.5": 384,
|
||||
"jinaai/jina-embeddings-v2-base-code": 768,
|
||||
"intfloat/multilingual-e5-large": 1024,
|
||||
"mixedbread-ai/mxbai-embed-large-v1": 1024,
|
||||
}
|
||||
|
||||
# Default model (fast profile)
|
||||
DEFAULT_MODEL = "BAAI/bge-small-en-v1.5"
|
||||
DEFAULT_PROFILE = "fast"
|
||||
|
||||
def __init__(self, model_name: str | None = None, profile: str | None = None) -> None:
    """Initialize embedder with model or profile.

    Args:
        model_name: Explicit model name (e.g., "jinaai/jina-embeddings-v2-base-code").
        profile: Model profile shortcut ("fast", "code", "multilingual", "balanced").
            If both are provided, model_name takes precedence. A profile that
            is not a key of MODELS is ignored and DEFAULT_MODEL is used.

    Raises:
        ImportError: If SEMANTIC_AVAILABLE is false.
    """
    if not SEMANTIC_AVAILABLE:
        raise ImportError(
            "Semantic search dependencies not available. "
            "Install with: pip install codexlens[semantic]"
        )

    # Resolve the model name exactly once: explicit name first, then a known
    # profile, then the default. The former unconditional assignment from the
    # legacy MODEL_NAME constant was always overwritten by this chain and is
    # intentionally not kept.
    if model_name:
        self.model_name = model_name
    elif profile and profile in self.MODELS:
        self.model_name = self.MODELS[profile]
    else:
        self.model_name = self.DEFAULT_MODEL

    # No model object is created here; it starts out unset.
    self._model = None
|
||||
|
||||
@property
|
||||
def embedding_dim(self) -> int:
|
||||
"""Get embedding dimension for current model."""
|
||||
return self.MODEL_DIMS.get(self.model_name, 768) # Default to 768 if unknown
|
||||
|
||||
def _load_model(self) -> None:
|
||||
"""Lazy load the embedding model."""
|
||||
if self._model is not None:
|
||||
|
||||
Reference in New Issue
Block a user