#!/usr/bin/env python3
"""Generate vector embeddings for existing CodexLens indexes.

This script is a CLI wrapper around the memory-efficient streaming implementation
in codexlens.cli.embedding_manager. It uses batch processing to keep memory usage
under 2GB regardless of project size.

Requirements:
    pip install codexlens[semantic]
    # or
    pip install fastembed numpy hnswlib

Usage:
    # Generate embeddings for a single index
    python generate_embeddings.py /path/to/_index.db

    # Use a specific embedding model
    python generate_embeddings.py /path/to/_index.db --model code

    # Generate centralized embeddings for all indexes in a directory
    python generate_embeddings.py --centralized ~/.codexlens/indexes

    # Force regeneration
    python generate_embeddings.py /path/to/_index.db --force
"""

import argparse
import logging
import sys
import warnings
from pathlib import Path

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%H:%M:%S',
)
logger = logging.getLogger(__name__)

# Import the memory-efficient implementation
try:
    from codexlens.cli.embedding_manager import (
        generate_embeddings,
        generate_dense_embeddings_centralized,
    )
    from codexlens.semantic import SEMANTIC_AVAILABLE
except ImportError as exc:
    logger.error(f"Failed to import codexlens: {exc}")
    logger.error("Make sure codexlens is installed: pip install codexlens")
    SEMANTIC_AVAILABLE = False
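
# If the import above failed, the embedding_manager names stay undefined;
# check_dependencies() below reports the problem and main() exits before
# any of them are referenced.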


def check_dependencies():
    """Check if semantic search dependencies are available."""
    if not SEMANTIC_AVAILABLE:
        logger.error("Semantic search dependencies not available")
        logger.error("Install with: pip install codexlens[semantic]")
        logger.error("Or: pip install fastembed numpy hnswlib")
        return False
    return True


def progress_callback(message: str):
    """Callback function for progress updates."""
    logger.info(message)


def generate_embeddings_for_index(
    index_db_path: Path,
    model_profile: str = "code",
    force: bool = False,
    chunk_size: int = 2000,
    **kwargs,  # Ignore unused parameters (workers, batch_size) for backward compatibility
) -> dict:
    """Generate embeddings for an index using memory-efficient streaming.

    This function wraps the streaming implementation from embedding_manager
    to maintain CLI compatibility while using the memory-optimized approach.

    Args:
        index_db_path: Path to _index.db file
        model_profile: Model profile to use (fast, code, multilingual, balanced)
        force: If True, regenerate even if embeddings exist
        chunk_size: Maximum chunk size in characters
        **kwargs: Additional parameters (ignored for compatibility)

    Returns:
        Dictionary with generation statistics
    """
    logger.info(f"Processing index: {index_db_path}")

    # Call the memory-efficient streaming implementation
    result = generate_embeddings(
        index_path=index_db_path,
        model_profile=model_profile,
        force=force,
        chunk_size=chunk_size,
        progress_callback=progress_callback,
    )

    if not result["success"]:
        if "error" in result:
            logger.error(result["error"])
        return result

    # Extract result data and log summary
    data = result["result"]
    logger.info("=" * 60)
    logger.info(f"Completed in {data['elapsed_time']:.1f}s")
    logger.info(f"Total chunks created: {data['chunks_created']}")
    logger.info(f"Files processed: {data['files_processed']}")
    if data['files_failed'] > 0:
        logger.warning(f"Failed files: {data['files_failed']}")
        if data.get('failed_files'):
            for file_path, error in data['failed_files']:
                logger.warning(f"  {file_path}: {error}")

    return {
        "success": True,
        "chunks_created": data["chunks_created"],
        "files_processed": data["files_processed"],
        "files_failed": data["files_failed"],
        "elapsed_time": data["elapsed_time"],
    }
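
# Example programmatic use (a sketch; the index path and printed fields are
# illustrative, not part of the CLI contract):
#
#     stats = generate_embeddings_for_index(
#         Path("~/.codexlens/indexes/myproject/_index.db").expanduser(),
#         model_profile="fast",
#     )
#     if stats["success"]:
#         print(f"{stats['chunks_created']} chunks in {stats['elapsed_time']:.1f}s")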


def main():
    parser = argparse.ArgumentParser(
        description="Generate vector embeddings for CodexLens indexes (memory-efficient streaming)",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument(
        "index_path",
        type=Path,
        help="Path to _index.db file or directory for centralized mode",
    )
    parser.add_argument(
        "--centralized",
        "-c",
        action="store_true",
        help="Use centralized vector storage (single HNSW index at project root)",
    )
    parser.add_argument(
        "--scan",
        action="store_true",
        help="(Deprecated) Use --centralized instead",
    )
    parser.add_argument(
        "--model",
        type=str,
        default="code",
        choices=["fast", "code", "multilingual", "balanced"],
        help="Embedding model profile (default: code)",
    )
    parser.add_argument(
        "--chunk-size",
        type=int,
        default=2000,
        help="Maximum chunk size in characters (default: 2000)",
    )
    parser.add_argument(
        "--workers",
        type=int,
        default=0,
        help="(Deprecated) Kept for backward compatibility, ignored",
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=256,
        help="(Deprecated) Kept for backward compatibility, ignored",
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help="Regenerate embeddings even if they exist",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Enable verbose logging",
    )
    args = parser.parse_args()

    # Configure logging level
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    # Check dependencies
    if not check_dependencies():
        sys.exit(1)

    # Resolve path
    index_path = args.index_path.expanduser().resolve()
    if not index_path.exists():
        logger.error(f"Path not found: {index_path}")
        sys.exit(1)

    # Handle the deprecated --scan flag
    use_centralized = args.centralized
    if args.scan:
        warnings.warn(
            "--scan is deprecated, use --centralized instead",
            DeprecationWarning,
        )
        logger.warning("--scan is deprecated. Use --centralized instead.")
        use_centralized = True

    # Determine whether to use centralized mode or a single index file
    if use_centralized or index_path.is_dir():
        # Centralized mode - single HNSW index at project root
        if index_path.is_file():
            logger.error("--centralized requires a directory path")
            sys.exit(1)
        logger.info(f"Generating centralized embeddings for: {index_path}")
        result = generate_dense_embeddings_centralized(
            index_root=index_path,
            model_profile=args.model,
            force=args.force,
            chunk_size=args.chunk_size,
            progress_callback=progress_callback,
        )
        if not result["success"]:
            logger.error(f"Failed: {result.get('error', 'Unknown error')}")
            sys.exit(1)

        # Log summary
        data = result["result"]
        logger.info(f"\n{'=' * 60}")
        logger.info("CENTRALIZED EMBEDDING COMPLETE")
        logger.info(f"{'=' * 60}")
        logger.info(f"Total chunks created: {data['chunks_created']}")
        logger.info(f"Total files processed: {data['files_processed']}")
        if data.get('files_failed', 0) > 0:
            logger.warning(f"Total files failed: {data['files_failed']}")
        logger.info(f"Central index: {data.get('central_index_path', 'N/A')}")
        logger.info(f"Time: {data.get('elapsed_time', 0):.1f}s")
    else:
        # Single-index mode
        if not index_path.name.endswith("_index.db"):
            logger.error("File must be named '_index.db'")
            sys.exit(1)
        result = generate_embeddings_for_index(
            index_path,
            model_profile=args.model,
            force=args.force,
            chunk_size=args.chunk_size,
        )
        if not result["success"]:
            logger.error(f"Failed: {result.get('error', 'Unknown error')}")
            sys.exit(1)
logger.info("\nv Embeddings generation complete!")
logger.info("\nYou can now use vector search:")
logger.info(" codexlens search 'your query' --mode pure-vector")


if __name__ == "__main__":
    main()