Add comprehensive tests for schema cleanup migration and search comparison

- Implement tests for migration 005 to verify removal of deprecated fields from the database schema (see the sketch after this list).
- Ensure that new databases are created with a clean schema.
- Validate that keywords are correctly extracted from the normalized file_keywords table.
- Test symbol insertion without deprecated fields and subdir operations without direct_files.
- Create a detailed search comparison test to evaluate vector search vs. hybrid search performance.
- Add a script for reindexing projects to extract code relationships and verify GraphAnalyzer functionality.
- Include a test script to check TreeSitter parser availability and relationship extraction from sample files.
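
A minimal sketch of the migration check described above (hedged: `run_migrations` and the deprecated "keywords" column are illustrative assumptions, not the shipped test code):

    # Sketch only: run_migrations and the "keywords" column name are assumptions.
    import sqlite3

    def test_migration_005_drops_deprecated_fields(tmp_path):
        db_path = tmp_path / "_index.db"
        run_migrations(db_path, target=5)  # hypothetical migration runner
        with sqlite3.connect(db_path) as conn:
            cols = {row[1] for row in conn.execute("PRAGMA table_info(files)")}
        assert "keywords" not in cols  # deprecated field should be gone after 005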
catlog22
2025-12-16 19:27:05 +08:00
parent 3da0ef2adb
commit df23975a0b
61 changed files with 13114 additions and 366 deletions


@@ -0,0 +1,331 @@
"""Embedding Manager - Manage semantic embeddings for code indexes."""
import logging
import sqlite3
import time
from pathlib import Path
from typing import Dict, List, Optional
try:
from codexlens.semantic import SEMANTIC_AVAILABLE
if SEMANTIC_AVAILABLE:
from codexlens.semantic.embedder import Embedder
from codexlens.semantic.vector_store import VectorStore
from codexlens.semantic.chunker import Chunker, ChunkConfig
except ImportError:
    SEMANTIC_AVAILABLE = False

logger = logging.getLogger(__name__)


def check_index_embeddings(index_path: Path) -> Dict[str, Any]:
"""Check if an index has embeddings and return statistics.
Args:
index_path: Path to _index.db file
Returns:
Dictionary with embedding statistics and status
"""
if not index_path.exists():
return {
"success": False,
"error": f"Index not found: {index_path}",
}
try:
with sqlite3.connect(index_path) as conn:
# Check if semantic_chunks table exists
cursor = conn.execute(
"SELECT name FROM sqlite_master WHERE type='table' AND name='semantic_chunks'"
)
table_exists = cursor.fetchone() is not None
if not table_exists:
# Count total indexed files even without embeddings
cursor = conn.execute("SELECT COUNT(*) FROM files")
total_files = cursor.fetchone()[0]
return {
"success": True,
"result": {
"has_embeddings": False,
"total_chunks": 0,
"total_files": total_files,
"files_with_chunks": 0,
"files_without_chunks": total_files,
"coverage_percent": 0.0,
"missing_files_sample": [],
"index_path": str(index_path),
},
}
# Count total chunks
cursor = conn.execute("SELECT COUNT(*) FROM semantic_chunks")
total_chunks = cursor.fetchone()[0]
# Count total indexed files
cursor = conn.execute("SELECT COUNT(*) FROM files")
total_files = cursor.fetchone()[0]
# Count files with embeddings
cursor = conn.execute(
"SELECT COUNT(DISTINCT file_path) FROM semantic_chunks"
)
files_with_chunks = cursor.fetchone()[0]
# Get a sample of files without embeddings
cursor = conn.execute("""
SELECT full_path
FROM files
WHERE full_path NOT IN (
SELECT DISTINCT file_path FROM semantic_chunks
)
LIMIT 5
""")
missing_files = [row[0] for row in cursor.fetchall()]
return {
"success": True,
"result": {
"has_embeddings": total_chunks > 0,
"total_chunks": total_chunks,
"total_files": total_files,
"files_with_chunks": files_with_chunks,
"files_without_chunks": total_files - files_with_chunks,
"coverage_percent": round((files_with_chunks / total_files * 100) if total_files > 0 else 0, 1),
"missing_files_sample": missing_files,
"index_path": str(index_path),
},
}
except Exception as e:
return {
"success": False,
"error": f"Failed to check embeddings: {str(e)}",
}
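
# A usage sketch (illustrative; the index path below is hypothetical):
#
#     stats = check_index_embeddings(Path.home() / ".codexlens" / "myproject" / "_index.db")
#     if stats["success"]:
#         print(f"coverage: {stats['result']['coverage_percent']}%")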


def generate_embeddings(
    index_path: Path,
    model_profile: str = "code",
    force: bool = False,
    chunk_size: int = 2000,
    progress_callback: Optional[Callable[[str], None]] = None,
) -> Dict[str, Any]:
"""Generate embeddings for an index.
Args:
index_path: Path to _index.db file
model_profile: Model profile (fast, code, multilingual, balanced)
force: If True, regenerate even if embeddings exist
chunk_size: Maximum chunk size in characters
progress_callback: Optional callback for progress updates
Returns:
Result dictionary with generation statistics
"""
if not SEMANTIC_AVAILABLE:
return {
"success": False,
"error": "Semantic search not available. Install with: pip install codexlens[semantic]",
}
if not index_path.exists():
return {
"success": False,
"error": f"Index not found: {index_path}",
}
# Check existing chunks
status = check_index_embeddings(index_path)
if not status["success"]:
return status
existing_chunks = status["result"]["total_chunks"]
if existing_chunks > 0 and not force:
return {
"success": False,
"error": f"Index already has {existing_chunks} chunks. Use --force to regenerate.",
"existing_chunks": existing_chunks,
}
if force and existing_chunks > 0:
if progress_callback:
progress_callback(f"Clearing {existing_chunks} existing chunks...")
try:
with sqlite3.connect(index_path) as conn:
conn.execute("DELETE FROM semantic_chunks")
conn.commit()
except Exception as e:
return {
"success": False,
"error": f"Failed to clear existing chunks: {str(e)}",
}
# Initialize components
try:
embedder = Embedder(profile=model_profile)
vector_store = VectorStore(index_path)
chunker = Chunker(config=ChunkConfig(max_chunk_size=chunk_size))
if progress_callback:
progress_callback(f"Using model: {embedder.model_name} ({embedder.embedding_dim} dimensions)")
except Exception as e:
return {
"success": False,
"error": f"Failed to initialize components: {str(e)}",
}
# Read files from index
try:
with sqlite3.connect(index_path) as conn:
conn.row_factory = sqlite3.Row
cursor = conn.execute("SELECT full_path, content, language FROM files")
files = cursor.fetchall()
except Exception as e:
return {
"success": False,
"error": f"Failed to read files: {str(e)}",
}
if len(files) == 0:
return {
"success": False,
"error": "No files found in index",
}
if progress_callback:
progress_callback(f"Processing {len(files)} files...")
# Process each file
total_chunks = 0
failed_files = []
start_time = time.time()
for idx, file_row in enumerate(files, 1):
file_path = file_row["full_path"]
content = file_row["content"]
language = file_row["language"] or "python"
try:
# Create chunks
chunks = chunker.chunk_sliding_window(
content,
file_path=file_path,
language=language
)
if not chunks:
continue
# Generate embeddings
for chunk in chunks:
embedding = embedder.embed_single(chunk.content)
chunk.embedding = embedding
# Store chunks
vector_store.add_chunks(chunks, file_path)
total_chunks += len(chunks)
if progress_callback:
progress_callback(f"[{idx}/{len(files)}] {file_path}: {len(chunks)} chunks")
except Exception as e:
logger.error(f"Failed to process {file_path}: {e}")
failed_files.append((file_path, str(e)))
elapsed_time = time.time() - start_time
return {
"success": True,
"result": {
"chunks_created": total_chunks,
"files_processed": len(files) - len(failed_files),
"files_failed": len(failed_files),
"elapsed_time": elapsed_time,
"model_profile": model_profile,
"model_name": embedder.model_name,
"failed_files": failed_files[:5], # First 5 failures
"index_path": str(index_path),
},
}
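
# A usage sketch (illustrative; passing print as the progress callback is just one option):
#
#     result = generate_embeddings(index_path, model_profile="code", progress_callback=print)
#     if not result["success"]:
#         print(result["error"])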


def find_all_indexes(scan_dir: Path) -> List[Path]:
    """Find all _index.db files in directory tree.

    Args:
        scan_dir: Directory to scan

    Returns:
        List of paths to _index.db files
    """
if not scan_dir.exists():
return []
return list(scan_dir.rglob("_index.db"))


def get_embedding_stats_summary(index_root: Path) -> Dict[str, Any]:
    """Get summary statistics for all indexes in root directory.

    Args:
        index_root: Root directory containing indexes

    Returns:
        Summary statistics for all indexes
    """
indexes = find_all_indexes(index_root)
if not indexes:
return {
"success": True,
"result": {
"total_indexes": 0,
"indexes_with_embeddings": 0,
"total_chunks": 0,
"indexes": [],
},
}
total_chunks = 0
indexes_with_embeddings = 0
index_stats = []
for index_path in indexes:
status = check_index_embeddings(index_path)
if status["success"]:
result = status["result"]
has_emb = result["has_embeddings"]
chunks = result["total_chunks"]
if has_emb:
indexes_with_embeddings += 1
total_chunks += chunks
# Extract project name from path
project_name = index_path.parent.name
index_stats.append({
"project": project_name,
"path": str(index_path),
"has_embeddings": has_emb,
"total_chunks": chunks,
"total_files": result["total_files"],
"coverage_percent": result.get("coverage_percent", 0),
})
return {
"success": True,
"result": {
"total_indexes": len(indexes),
"indexes_with_embeddings": indexes_with_embeddings,
"total_chunks": total_chunks,
"indexes": index_stats,
},
}
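
# An end-to-end sketch (illustrative; the root directory below is hypothetical):
#
#     summary = get_embedding_stats_summary(Path.home() / ".codexlens")
#     for entry in summary["result"]["indexes"]:
#         print(entry["project"], entry["total_chunks"], f"{entry['coverage_percent']}%")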