#!/usr/bin/env python3
"""Generate vector embeddings for existing CodexLens indexes.

This script is a CLI wrapper around the memory-efficient streaming implementation
in codexlens.cli.embedding_manager. It uses batch processing to keep memory usage
under 2GB regardless of project size.

Requirements:
    pip install codexlens[semantic]
    # or
    pip install fastembed numpy hnswlib

Usage:
    # Generate embeddings for a single index
    python generate_embeddings.py /path/to/_index.db

    # Use a specific embedding model
    python generate_embeddings.py /path/to/_index.db --model code

    # Generate centralized embeddings for all indexes in a directory
    python generate_embeddings.py --centralized ~/.codexlens/indexes

    # Force regeneration
    python generate_embeddings.py /path/to/_index.db --force
"""

import argparse
import logging
import sys
import warnings
from pathlib import Path

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%H:%M:%S'
)
logger = logging.getLogger(__name__)

# Import the memory-efficient implementation
try:
    from codexlens.cli.embedding_manager import (
        generate_embeddings,
        generate_dense_embeddings_centralized,
    )
    from codexlens.semantic import SEMANTIC_AVAILABLE
except ImportError as exc:
    logger.error(f"Failed to import codexlens: {exc}")
    logger.error("Make sure codexlens is installed: pip install codexlens")
    SEMANTIC_AVAILABLE = False
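
# The streaming implementation lives in codexlens.cli.embedding_manager, not
# here. As a rough sketch (the names below are illustrative, not the real
# API), it embeds and persists one bounded batch at a time:
#
#     for batch in batched(iter_pending_chunks(index_db), batch_size):
#         vectors = model.embed(chunk.text for chunk in batch)
#         vector_store.add(vectors)  # flushed before the next batch loads
#
# Only one batch of text and vectors is held in memory at once, which is what
# keeps peak usage bounded regardless of project size.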


def check_dependencies():
    """Check if semantic search dependencies are available."""
    if not SEMANTIC_AVAILABLE:
        logger.error("Semantic search dependencies not available")
        logger.error("Install with: pip install codexlens[semantic]")
        logger.error("Or: pip install fastembed numpy hnswlib")
        return False
    return True


def progress_callback(message: str):
    """Callback function for progress updates."""
    logger.info(message)


def generate_embeddings_for_index(
    index_db_path: Path,
    model_profile: str = "code",
    force: bool = False,
    chunk_size: int = 2000,
    **kwargs  # Ignore unused parameters (workers, batch_size) for backward compatibility
) -> dict:
    """Generate embeddings for an index using memory-efficient streaming.

    This function wraps the streaming implementation from embedding_manager
    to maintain CLI compatibility while using the memory-optimized approach.

    Args:
        index_db_path: Path to _index.db file
        model_profile: Model profile to use (fast, code, multilingual, balanced)
        force: If True, regenerate even if embeddings exist
        chunk_size: Maximum chunk size in characters
        **kwargs: Additional parameters (ignored for compatibility)

    Returns:
        Dictionary with generation statistics
    """
    logger.info(f"Processing index: {index_db_path}")

    # Call the memory-efficient streaming implementation
    result = generate_embeddings(
        index_path=index_db_path,
        model_profile=model_profile,
        force=force,
        chunk_size=chunk_size,
        progress_callback=progress_callback,
    )

    if not result["success"]:
        if "error" in result:
            logger.error(result["error"])
        return result

    # Extract result data and log summary
    data = result["result"]
    logger.info("=" * 60)
    logger.info(f"Completed in {data['elapsed_time']:.1f}s")
    logger.info(f"Total chunks created: {data['chunks_created']}")
    logger.info(f"Files processed: {data['files_processed']}")
    if data['files_failed'] > 0:
        logger.warning(f"Failed files: {data['files_failed']}")
        if data.get('failed_files'):
            for file_path, error in data['failed_files']:
                logger.warning(f"  {file_path}: {error}")

    return {
        "success": True,
        "chunks_created": data["chunks_created"],
        "files_processed": data["files_processed"],
        "files_failed": data["files_failed"],
        "elapsed_time": data["elapsed_time"],
    }
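
# Example programmatic use of the wrapper above (the index path is
# hypothetical; any existing CodexLens _index.db works):
#
#     from pathlib import Path
#     stats = generate_embeddings_for_index(
#         Path("~/.codexlens/indexes/myproject/_index.db").expanduser(),
#         model_profile="code",
#         force=True,
#     )
#     if stats["success"]:
#         print(f"{stats['chunks_created']} chunks in {stats['elapsed_time']:.1f}s")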


def main():
    parser = argparse.ArgumentParser(
        description="Generate vector embeddings for CodexLens indexes (memory-efficient streaming)",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )

    parser.add_argument(
        "index_path",
        type=Path,
        help="Path to _index.db file or directory for centralized mode"
    )

    parser.add_argument(
        "--centralized",
        "-c",
        action="store_true",
        help="Use centralized vector storage (single HNSW index at project root)"
    )

    parser.add_argument(
        "--scan",
        action="store_true",
        help="(Deprecated) Use --centralized instead"
    )

    parser.add_argument(
        "--model",
        type=str,
        default="code",
        choices=["fast", "code", "multilingual", "balanced"],
        help="Embedding model profile (default: code)"
    )

    parser.add_argument(
        "--chunk-size",
        type=int,
        default=2000,
        help="Maximum chunk size in characters (default: 2000)"
    )

    parser.add_argument(
        "--workers",
        type=int,
        default=0,
        help="(Deprecated) Kept for backward compatibility, ignored"
    )

    parser.add_argument(
        "--batch-size",
        type=int,
        default=256,
        help="(Deprecated) Kept for backward compatibility, ignored"
    )

    parser.add_argument(
        "--force",
        action="store_true",
        help="Regenerate embeddings even if they exist"
    )

    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Enable verbose logging"
    )

    args = parser.parse_args()

    # Configure logging level
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    # Check dependencies
    if not check_dependencies():
        sys.exit(1)

    # Resolve path
    index_path = args.index_path.expanduser().resolve()

    if not index_path.exists():
        logger.error(f"Path not found: {index_path}")
        sys.exit(1)

    # Handle deprecated --scan flag
    use_centralized = args.centralized
    if args.scan:
        warnings.warn(
            "--scan is deprecated, use --centralized instead",
            DeprecationWarning
        )
        logger.warning("--scan is deprecated. Use --centralized instead.")
        use_centralized = True

    # Determine whether to use centralized mode or single-index mode
    if use_centralized or index_path.is_dir():
        # Centralized mode - single HNSW index at project root
        if index_path.is_file():
            logger.error("--centralized requires a directory path")
            sys.exit(1)

        logger.info(f"Generating centralized embeddings for: {index_path}")
        result = generate_dense_embeddings_centralized(
            index_root=index_path,
            model_profile=args.model,
            force=args.force,
            chunk_size=args.chunk_size,
            progress_callback=progress_callback,
        )

        if not result["success"]:
            logger.error(f"Failed: {result.get('error', 'Unknown error')}")
            sys.exit(1)

        # Log summary
        data = result["result"]
        logger.info(f"\n{'=' * 60}")
        logger.info("CENTRALIZED EMBEDDING COMPLETE")
        logger.info(f"{'=' * 60}")
        logger.info(f"Total chunks created: {data['chunks_created']}")
        logger.info(f"Total files processed: {data['files_processed']}")
        if data.get('files_failed', 0) > 0:
            logger.warning(f"Total files failed: {data['files_failed']}")
        logger.info(f"Central index: {data.get('central_index_path', 'N/A')}")
        logger.info(f"Time: {data.get('elapsed_time', 0):.1f}s")

    else:
        # Single index mode
        if not index_path.name.endswith("_index.db"):
            logger.error("Index file name must end with '_index.db'")
            sys.exit(1)

        result = generate_embeddings_for_index(
            index_path,
            model_profile=args.model,
            force=args.force,
            chunk_size=args.chunk_size,
        )

        if not result["success"]:
            logger.error(f"Failed: {result.get('error', 'Unknown error')}")
            sys.exit(1)

    logger.info("\n✓ Embedding generation complete!")
    logger.info("\nYou can now use vector search:")
    logger.info("  codexlens search 'your query' --mode pure-vector")
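
# Centralized mode can likewise be driven programmatically (illustrative; this
# mirrors the call main() makes above, with a hypothetical index root):
#
#     from pathlib import Path
#     result = generate_dense_embeddings_centralized(
#         index_root=Path("~/.codexlens/indexes").expanduser(),
#         model_profile="code",
#         force=False,
#         chunk_size=2000,
#         progress_callback=progress_callback,
#     )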


if __name__ == "__main__":
    main()