Mirror of https://github.com/catlog22/Claude-Code-Workflow.git, synced 2026-02-06 01:54:11 +08:00
- Implement tests for migration 005 to verify removal of deprecated fields in the database schema.
- Ensure that new databases are created with a clean schema.
- Validate that keywords are correctly extracted from the normalized file_keywords table.
- Test symbol insertion without deprecated fields and subdir operations without direct_files.
- Create a detailed search comparison test to evaluate vector search vs hybrid search performance.
- Add a script for reindexing projects to extract code relationships and verify GraphAnalyzer functionality.
- Include a test script to check TreeSitter parser availability and relationship extraction from sample files.
364 lines
11 KiB
Python
#!/usr/bin/env python3
"""Generate vector embeddings for existing CodexLens indexes.

This script processes all files in a CodexLens index database and generates
semantic vector embeddings for code chunks. The embeddings are stored in the
same SQLite database, in the 'semantic_chunks' table.

Requirements:
    pip install codexlens[semantic]
    # or
    pip install fastembed numpy

Usage:
    # Generate embeddings for a single index
    python generate_embeddings.py /path/to/_index.db

    # Generate embeddings for all indexes in a directory
    python generate_embeddings.py --scan ~/.codexlens/indexes

    # Use a specific embedding model
    python generate_embeddings.py /path/to/_index.db --model code

    # Batch processing with progress
    find ~/.codexlens/indexes -name "_index.db" | xargs -I {} python generate_embeddings.py {}
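
    # Force regeneration with a smaller chunk size (both flags are defined below)
    python generate_embeddings.py /path/to/_index.db --force --chunk-size 1000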
"""

import argparse
import logging
import sqlite3
import sys
import time
from pathlib import Path
from typing import List

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%H:%M:%S'
)
logger = logging.getLogger(__name__)


def check_dependencies():
    """Check if semantic search dependencies are available."""
    try:
        from codexlens.semantic import SEMANTIC_AVAILABLE
        if not SEMANTIC_AVAILABLE:
            logger.error("Semantic search dependencies not available")
            logger.error("Install with: pip install codexlens[semantic]")
            logger.error("Or: pip install fastembed numpy")
            return False
        return True
    except ImportError as exc:
        logger.error(f"Failed to import codexlens: {exc}")
        logger.error("Make sure codexlens is installed: pip install codexlens")
        return False


def count_files(index_db_path: Path) -> int:
    """Count total files in index."""
    try:
        with sqlite3.connect(index_db_path) as conn:
            cursor = conn.execute("SELECT COUNT(*) FROM files")
            return cursor.fetchone()[0]
    except Exception as exc:
        logger.error(f"Failed to count files: {exc}")
        return 0


def check_existing_chunks(index_db_path: Path) -> int:
    """Check if semantic chunks already exist."""
    try:
        with sqlite3.connect(index_db_path) as conn:
            # Check if table exists
            cursor = conn.execute(
                "SELECT name FROM sqlite_master WHERE type='table' AND name='semantic_chunks'"
            )
            if not cursor.fetchone():
                return 0

            # Count existing chunks
            cursor = conn.execute("SELECT COUNT(*) FROM semantic_chunks")
            return cursor.fetchone()[0]
    except Exception:
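        # Treat any failure (missing table, unreadable database) as
        # "no chunks yet" so callers simply proceed with generation.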
        return 0


def generate_embeddings_for_index(
    index_db_path: Path,
    model_profile: str = "code",
    force: bool = False,
    chunk_size: int = 2000,
) -> dict:
    """Generate embeddings for all files in an index.

    Args:
        index_db_path: Path to _index.db file
        model_profile: Model profile to use (fast, code, multilingual, balanced)
        force: If True, regenerate even if embeddings exist
        chunk_size: Maximum chunk size in characters

    Returns:
        Dictionary with generation statistics
    """
    logger.info(f"Processing index: {index_db_path}")

    # Check existing chunks
    existing_chunks = check_existing_chunks(index_db_path)
    if existing_chunks > 0 and not force:
        logger.warning(f"Index already has {existing_chunks} chunks")
        logger.warning("Use --force to regenerate")
        return {
            "success": False,
            "error": "Embeddings already exist",
            "existing_chunks": existing_chunks,
        }

    if force and existing_chunks > 0:
        logger.info(f"Force mode: clearing {existing_chunks} existing chunks")
        try:
            with sqlite3.connect(index_db_path) as conn:
                conn.execute("DELETE FROM semantic_chunks")
                conn.commit()
        except Exception as exc:
            logger.error(f"Failed to clear existing chunks: {exc}")

    # Import dependencies
    try:
        from codexlens.semantic.embedder import Embedder
        from codexlens.semantic.vector_store import VectorStore
        from codexlens.semantic.chunker import Chunker, ChunkConfig
    except ImportError as exc:
        return {
            "success": False,
            "error": f"Import failed: {exc}",
        }

    # Initialize components
    try:
        embedder = Embedder(profile=model_profile)
        vector_store = VectorStore(index_db_path)
        chunker = Chunker(config=ChunkConfig(max_chunk_size=chunk_size))

        logger.info(f"Using model: {embedder.model_name}")
        logger.info(f"Embedding dimension: {embedder.embedding_dim}")
    except Exception as exc:
        return {
            "success": False,
            "error": f"Failed to initialize components: {exc}",
        }

    # Read files from index
    try:
        with sqlite3.connect(index_db_path) as conn:
            conn.row_factory = sqlite3.Row
            cursor = conn.execute("SELECT full_path, content, language FROM files")
            files = cursor.fetchall()
    except Exception as exc:
        return {
            "success": False,
            "error": f"Failed to read files: {exc}",
        }
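
    # Note: file content comes from the index database itself, so the
    # embeddings reflect the indexed snapshot rather than the working tree.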

    logger.info(f"Found {len(files)} files to process")
    if len(files) == 0:
        return {
            "success": False,
            "error": "No files found in index",
        }

    # Process each file
    total_chunks = 0
    failed_files = []
    start_time = time.time()

    for idx, file_row in enumerate(files, 1):
        file_path = file_row["full_path"]
        content = file_row["content"]
        language = file_row["language"] or "python"

        try:
            # Create chunks using sliding window
            chunks = chunker.chunk_sliding_window(
                content,
                file_path=file_path,
                language=language
            )

            if not chunks:
                logger.debug(f"[{idx}/{len(files)}] {file_path}: No chunks created")
                continue

            # Generate embeddings
            for chunk in chunks:
                embedding = embedder.embed_single(chunk.content)
                chunk.embedding = embedding
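            # Chunks are embedded one per call via embed_single; if the
            # Embedder exposes a batch-encoding method, encoding all of a
            # file's chunks at once would likely be faster (not assumed here).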

            # Store chunks
            vector_store.add_chunks(chunks, file_path)
            total_chunks += len(chunks)

            logger.info(f"[{idx}/{len(files)}] {file_path}: {len(chunks)} chunks")

        except Exception as exc:
            logger.error(f"[{idx}/{len(files)}] {file_path}: ERROR - {exc}")
            failed_files.append((file_path, str(exc)))

    elapsed_time = time.time() - start_time

    # Generate summary
    logger.info("=" * 60)
    logger.info(f"Completed in {elapsed_time:.1f}s")
    logger.info(f"Total chunks created: {total_chunks}")
    logger.info(f"Files processed: {len(files) - len(failed_files)}/{len(files)}")
    if failed_files:
        logger.warning(f"Failed files: {len(failed_files)}")
        for file_path, error in failed_files[:5]:  # Show first 5 failures
            logger.warning(f"  {file_path}: {error}")

    return {
        "success": True,
        "chunks_created": total_chunks,
        "files_processed": len(files) - len(failed_files),
        "files_failed": len(failed_files),
        "elapsed_time": elapsed_time,
    }


def find_index_databases(scan_dir: Path) -> List[Path]:
    """Find all _index.db files in directory tree."""
    logger.info(f"Scanning for indexes in: {scan_dir}")
    index_files = list(scan_dir.rglob("_index.db"))
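    # rglob matches the exact filename "_index.db"; names that merely end in
    # that suffix (accepted by the single-index check in main()) are skipped.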
    logger.info(f"Found {len(index_files)} index databases")
    return index_files


def main():
    parser = argparse.ArgumentParser(
        description="Generate vector embeddings for CodexLens indexes",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )

    parser.add_argument(
        "index_path",
        type=Path,
        help="Path to _index.db file or directory to scan"
    )

    parser.add_argument(
        "--scan",
        action="store_true",
        help="Scan directory tree for all _index.db files"
    )

    parser.add_argument(
        "--model",
        type=str,
        default="code",
        choices=["fast", "code", "multilingual", "balanced"],
        help="Embedding model profile (default: code)"
    )

    parser.add_argument(
        "--chunk-size",
        type=int,
        default=2000,
        help="Maximum chunk size in characters (default: 2000)"
    )

    parser.add_argument(
        "--force",
        action="store_true",
        help="Regenerate embeddings even if they exist"
    )

    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Enable verbose logging"
    )

    args = parser.parse_args()

    # Configure logging level
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)
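        # DEBUG level also surfaces the per-file "No chunks created"
        # messages emitted during generation.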

    # Check dependencies
    if not check_dependencies():
        sys.exit(1)

    # Resolve path
    index_path = args.index_path.expanduser().resolve()

    if not index_path.exists():
        logger.error(f"Path not found: {index_path}")
        sys.exit(1)

    # Determine if scanning or single file
    if args.scan or index_path.is_dir():
        # Scan mode
        if index_path.is_file():
            logger.error("--scan requires a directory path")
            sys.exit(1)
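
        # Note: a directory argument implies scan mode even without --scan;
        # the check above exists to reject --scan pointed at a file.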

        index_files = find_index_databases(index_path)
        if not index_files:
            logger.error(f"No index databases found in: {index_path}")
            sys.exit(1)

        # Process each index
        total_chunks = 0
        successful = 0
        for idx, index_file in enumerate(index_files, 1):
            logger.info(f"\n{'='*60}")
            logger.info(f"Processing index {idx}/{len(index_files)}")
            logger.info(f"{'='*60}")

            result = generate_embeddings_for_index(
                index_file,
                model_profile=args.model,
                force=args.force,
                chunk_size=args.chunk_size,
            )

            if result["success"]:
                total_chunks += result["chunks_created"]
                successful += 1
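            # A failed index only lowers the successful/total tally; its
            # result["error"] is not printed in batch mode (single-index
            # mode below does report it).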

        # Final summary
        logger.info(f"\n{'='*60}")
        logger.info("BATCH PROCESSING COMPLETE")
        logger.info(f"{'='*60}")
        logger.info(f"Indexes processed: {successful}/{len(index_files)}")
        logger.info(f"Total chunks created: {total_chunks}")

    else:
        # Single index mode
        if not index_path.name.endswith("_index.db"):
            logger.error("File must be named '_index.db'")
            sys.exit(1)
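        # endswith accepts any name ending in "_index.db" (e.g.
        # "project_index.db"), not just the literal filename.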

        result = generate_embeddings_for_index(
            index_path,
            model_profile=args.model,
            force=args.force,
            chunk_size=args.chunk_size,
        )

        if not result["success"]:
            logger.error(f"Failed: {result.get('error', 'Unknown error')}")
            sys.exit(1)

    logger.info("\n✓ Embedding generation complete!")
    logger.info("\nYou can now use vector search:")
    logger.info("  codexlens search 'your query' --mode pure-vector")


if __name__ == "__main__":
    main()