Claude-Code-Workflow/codex-lens/scripts/generate_embeddings.py
catlog22 df23975a0b Add comprehensive tests for schema cleanup migration and search comparison
- Implement tests for migration 005 to verify removal of deprecated fields in the database schema.
- Ensure that new databases are created with a clean schema.
- Validate that keywords are correctly extracted from the normalized file_keywords table.
- Test symbol insertion without deprecated fields and subdir operations without direct_files.
- Create a detailed search comparison test to evaluate vector search vs hybrid search performance.
- Add a script for reindexing projects to extract code relationships and verify GraphAnalyzer functionality.
- Include a test script to check TreeSitter parser availability and relationship extraction from sample files.
2025-12-16 19:27:05 +08:00


#!/usr/bin/env python3
"""Generate vector embeddings for existing CodexLens indexes.

This script processes all files in a CodexLens index database and generates
semantic vector embeddings for code chunks. The embeddings are stored in the
same SQLite database in the 'semantic_chunks' table.

Requirements:
    pip install codexlens[semantic]
    # or
    pip install fastembed numpy

Usage:
    # Generate embeddings for a single index
    python generate_embeddings.py /path/to/_index.db

    # Generate embeddings for all indexes in a directory
    python generate_embeddings.py --scan ~/.codexlens/indexes

    # Use a specific embedding model
    python generate_embeddings.py /path/to/_index.db --model code

    # Batch processing with progress
    find ~/.codexlens/indexes -name "_index.db" | xargs -I {} python generate_embeddings.py {}
"""

import argparse
import logging
import sqlite3
import sys
import time
from pathlib import Path
from typing import List, Optional

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%H:%M:%S'
)
logger = logging.getLogger(__name__)


def check_dependencies():
    """Check if semantic search dependencies are available."""
    try:
        from codexlens.semantic import SEMANTIC_AVAILABLE

        if not SEMANTIC_AVAILABLE:
            logger.error("Semantic search dependencies not available")
            logger.error("Install with: pip install codexlens[semantic]")
            logger.error("Or: pip install fastembed numpy")
            return False
        return True
    except ImportError as exc:
        logger.error(f"Failed to import codexlens: {exc}")
        logger.error("Make sure codexlens is installed: pip install codexlens")
        return False


def count_files(index_db_path: Path) -> int:
    """Count total files in index."""
    try:
        with sqlite3.connect(index_db_path) as conn:
            cursor = conn.execute("SELECT COUNT(*) FROM files")
            return cursor.fetchone()[0]
    except Exception as exc:
        logger.error(f"Failed to count files: {exc}")
        return 0


def check_existing_chunks(index_db_path: Path) -> int:
    """Check if semantic chunks already exist."""
    try:
        with sqlite3.connect(index_db_path) as conn:
            # Check if table exists
            cursor = conn.execute(
                "SELECT name FROM sqlite_master WHERE type='table' AND name='semantic_chunks'"
            )
            if not cursor.fetchone():
                return 0
            # Count existing chunks
            cursor = conn.execute("SELECT COUNT(*) FROM semantic_chunks")
            return cursor.fetchone()[0]
    except Exception:
        return 0


def generate_embeddings_for_index(
    index_db_path: Path,
    model_profile: str = "code",
    force: bool = False,
    chunk_size: int = 2000,
) -> dict:
    """Generate embeddings for all files in an index.

    Args:
        index_db_path: Path to _index.db file
        model_profile: Model profile to use (fast, code, multilingual, balanced)
        force: If True, regenerate even if embeddings exist
        chunk_size: Maximum chunk size in characters

    Returns:
        Dictionary with generation statistics
    """
    logger.info(f"Processing index: {index_db_path}")

    # Check existing chunks
    existing_chunks = check_existing_chunks(index_db_path)
    if existing_chunks > 0 and not force:
        logger.warning(f"Index already has {existing_chunks} chunks")
        logger.warning("Use --force to regenerate")
        return {
            "success": False,
            "error": "Embeddings already exist",
            "existing_chunks": existing_chunks,
        }

    if force and existing_chunks > 0:
        logger.info(f"Force mode: clearing {existing_chunks} existing chunks")
        try:
            with sqlite3.connect(index_db_path) as conn:
                conn.execute("DELETE FROM semantic_chunks")
                conn.commit()
        except Exception as exc:
            logger.error(f"Failed to clear existing chunks: {exc}")

    # Import dependencies
    try:
        from codexlens.semantic.embedder import Embedder
        from codexlens.semantic.vector_store import VectorStore
        from codexlens.semantic.chunker import Chunker, ChunkConfig
    except ImportError as exc:
        return {
            "success": False,
            "error": f"Import failed: {exc}",
        }

    # Initialize components
    try:
        embedder = Embedder(profile=model_profile)
        vector_store = VectorStore(index_db_path)
        chunker = Chunker(config=ChunkConfig(max_chunk_size=chunk_size))
        logger.info(f"Using model: {embedder.model_name}")
        logger.info(f"Embedding dimension: {embedder.embedding_dim}")
    except Exception as exc:
        return {
            "success": False,
            "error": f"Failed to initialize components: {exc}",
        }

    # Read files from index
    try:
        with sqlite3.connect(index_db_path) as conn:
            conn.row_factory = sqlite3.Row
            cursor = conn.execute("SELECT full_path, content, language FROM files")
            files = cursor.fetchall()
    except Exception as exc:
        return {
            "success": False,
            "error": f"Failed to read files: {exc}",
        }

    logger.info(f"Found {len(files)} files to process")
    if len(files) == 0:
        return {
            "success": False,
            "error": "No files found in index",
        }

    # Process each file
    total_chunks = 0
    failed_files = []
    start_time = time.time()

    for idx, file_row in enumerate(files, 1):
        file_path = file_row["full_path"]
        content = file_row["content"]
        language = file_row["language"] or "python"

        try:
            # Create chunks using sliding window
            chunks = chunker.chunk_sliding_window(
                content,
                file_path=file_path,
                language=language
            )
            if not chunks:
                logger.debug(f"[{idx}/{len(files)}] {file_path}: No chunks created")
                continue

            # Generate embeddings
            for chunk in chunks:
                embedding = embedder.embed_single(chunk.content)
                chunk.embedding = embedding

            # Store chunks
            vector_store.add_chunks(chunks, file_path)
            total_chunks += len(chunks)
            logger.info(f"[{idx}/{len(files)}] {file_path}: {len(chunks)} chunks")
        except Exception as exc:
            logger.error(f"[{idx}/{len(files)}] {file_path}: ERROR - {exc}")
            failed_files.append((file_path, str(exc)))
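
    # Note: embeddings are generated one chunk at a time via embed_single. If
    # the Embedder exposes a batch API (hypothetical; not verified against the
    # codexlens source), embedding each file's chunks in one call would likely
    # reduce per-call model overhead, e.g.:
    #
    #   vectors = embedder.embed_batch([c.content for c in chunks])
    #   for chunk, vector in zip(chunks, vectors):
    #       chunk.embedding = vector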

    elapsed_time = time.time() - start_time

    # Generate summary
    logger.info("=" * 60)
    logger.info(f"Completed in {elapsed_time:.1f}s")
    logger.info(f"Total chunks created: {total_chunks}")
    logger.info(f"Files processed: {len(files) - len(failed_files)}/{len(files)}")
    if failed_files:
        logger.warning(f"Failed files: {len(failed_files)}")
        for file_path, error in failed_files[:5]:  # Show first 5 failures
            logger.warning(f"  {file_path}: {error}")

    return {
        "success": True,
        "chunks_created": total_chunks,
        "files_processed": len(files) - len(failed_files),
        "files_failed": len(failed_files),
        "elapsed_time": elapsed_time,
    }
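

# generate_embeddings_for_index can also be driven from other Python tooling.
# A minimal sketch based on its signature and return dict above (the path is a
# placeholder):
#
#   stats = generate_embeddings_for_index(Path("/path/to/_index.db"), force=True)
#   if stats["success"]:
#       print(f'{stats["chunks_created"]} chunks in {stats["elapsed_time"]:.1f}s')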


def find_index_databases(scan_dir: Path) -> List[Path]:
    """Find all _index.db files in directory tree."""
    logger.info(f"Scanning for indexes in: {scan_dir}")
    index_files = list(scan_dir.rglob("_index.db"))
    logger.info(f"Found {len(index_files)} index databases")
    return index_files


def main():
    parser = argparse.ArgumentParser(
        description="Generate vector embeddings for CodexLens indexes",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )
    parser.add_argument(
        "index_path",
        type=Path,
        help="Path to _index.db file or directory to scan"
    )
    parser.add_argument(
        "--scan",
        action="store_true",
        help="Scan directory tree for all _index.db files"
    )
    parser.add_argument(
        "--model",
        type=str,
        default="code",
        choices=["fast", "code", "multilingual", "balanced"],
        help="Embedding model profile (default: code)"
    )
    parser.add_argument(
        "--chunk-size",
        type=int,
        default=2000,
        help="Maximum chunk size in characters (default: 2000)"
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help="Regenerate embeddings even if they exist"
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Enable verbose logging"
    )
    args = parser.parse_args()

    # Configure logging level
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    # Check dependencies
    if not check_dependencies():
        sys.exit(1)

    # Resolve path
    index_path = args.index_path.expanduser().resolve()
    if not index_path.exists():
        logger.error(f"Path not found: {index_path}")
        sys.exit(1)

    # Determine if scanning or single file
    if args.scan or index_path.is_dir():
        # Scan mode
        if index_path.is_file():
            logger.error("--scan requires a directory path")
            sys.exit(1)

        index_files = find_index_databases(index_path)
        if not index_files:
            logger.error(f"No index databases found in: {index_path}")
            sys.exit(1)

        # Process each index
        total_chunks = 0
        successful = 0
        for idx, index_file in enumerate(index_files, 1):
            logger.info(f"\n{'='*60}")
            logger.info(f"Processing index {idx}/{len(index_files)}")
            logger.info(f"{'='*60}")
            result = generate_embeddings_for_index(
                index_file,
                model_profile=args.model,
                force=args.force,
                chunk_size=args.chunk_size,
            )
            if result["success"]:
                total_chunks += result["chunks_created"]
                successful += 1

        # Final summary
        logger.info(f"\n{'='*60}")
        logger.info("BATCH PROCESSING COMPLETE")
        logger.info(f"{'='*60}")
        logger.info(f"Indexes processed: {successful}/{len(index_files)}")
        logger.info(f"Total chunks created: {total_chunks}")
    else:
        # Single index mode
        if not index_path.name.endswith("_index.db"):
            logger.error("File must be named '_index.db'")
            sys.exit(1)

        result = generate_embeddings_for_index(
            index_path,
            model_profile=args.model,
            force=args.force,
            chunk_size=args.chunk_size,
        )
        if not result["success"]:
            logger.error(f"Failed: {result.get('error', 'Unknown error')}")
            sys.exit(1)

        logger.info("\n✓ Embeddings generation complete!")
        logger.info("\nYou can now use vector search:")
        logger.info("  codexlens search 'your query' --mode pure-vector")


if __name__ == "__main__":
    main()