Files
Claude-Code-Workflow/.claude/python_script/indexer.py
catlog22 fc6e851230 refactor: Update workflow plan system and template organization
- Remove --analyze|--deep parameters from plan.md, use default analysis
- Change .analysis to .process directory structure for better organization
- Create ANALYSIS_RESULTS.md template focused on verified results
- Add .process folder to workflow-architecture.md file structure
- Template emphasizes verification of files, methods, and commands
- Prevent execution errors from non-existent references

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-09-18 16:26:50 +08:00

204 lines
7.4 KiB
Python

#!/usr/bin/env python3
"""
File Structure Indexer
Builds and maintains file indices for intelligent analysis.
"""
import sys
import argparse
import logging
import json
import time
from pathlib import Path
from typing import Dict, List, Optional, Any

# Add current directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))

from core.config import get_config
from core.file_indexer import FileIndexer, IndexStats
from core.embedding_manager import EmbeddingManager
from utils.colors import Colors


class ProjectIndexer:
    """Manages file indexing and project statistics."""

    def __init__(self, config_path: Optional[str] = None, root_path: str = "."):
        self.root_path = Path(root_path).resolve()
        self.config = get_config(config_path)

        # Setup logging
        logging.basicConfig(
            level=getattr(logging, self.config.get('logging.level', 'INFO')),
            format=self.config.get('logging.format', '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        )
        self.logger = logging.getLogger(__name__)

        # Initialize core components
        self.indexer = FileIndexer(self.config, str(self.root_path))

        # Initialize embedding manager if enabled
        self.embedding_manager = None
        if self.config.is_embedding_enabled():
            try:
                self.embedding_manager = EmbeddingManager(self.config)
            except ImportError:
                self.logger.warning("Embedding dependencies not available. Install sentence-transformers for enhanced functionality.")

    def build_index(self) -> Optional[IndexStats]:
        """Build or update the file index."""
        print(Colors.yellow("Building file index..."))
        start_time = time.time()

        self.indexer.build_index()
        stats = self.indexer.get_stats()
        elapsed = time.time() - start_time

        if stats:
            print(Colors.green(f"Index built: {stats.total_files} files, ~{stats.total_tokens:,} tokens ({elapsed:.2f}s)"))
        else:
            print(Colors.green(f"Index built successfully ({elapsed:.2f}s)"))
        return stats

    def update_embeddings(self) -> bool:
        """Update embeddings for semantic similarity."""
        if not self.embedding_manager:
            print(Colors.error("Embedding functionality not available"))
            return False

        print(Colors.yellow("Updating embeddings..."))
        start_time = time.time()

        # Load file index, building it first if it does not exist yet
        index = self.indexer.load_index()
        if not index:
            print(Colors.warning("No file index found. Building index first..."))
            self.build_index()
            index = self.indexer.load_index()

        try:
            count = self.embedding_manager.update_embeddings(index)
            elapsed = time.time() - start_time
            print(Colors.green(f"Updated {count} embeddings ({elapsed:.2f}s)"))
            return True
        except Exception as e:
            print(Colors.error(f"Failed to update embeddings: {e}"))
            return False

    def get_project_stats(self) -> Dict[str, Any]:
        """Get comprehensive project statistics."""
        stats = self.indexer.get_stats()

        embedding_stats = {}
        if self.embedding_manager:
            embedding_stats = {
                'embeddings_exist': self.embedding_manager.embeddings_exist(),
                'embedding_count': len(self.embedding_manager._load_embedding_cache()) if self.embedding_manager.embeddings_exist() else 0
            }

        project_size = self._classify_project_size(stats.total_tokens if stats else 0)

        return {
            'files': stats.total_files if stats else 0,
            'tokens': stats.total_tokens if stats else 0,
            'size_bytes': stats.total_size if stats else 0,
            'categories': stats.categories if stats else {},
            'project_size': project_size,
            'last_updated': stats.last_updated if stats else 0,
            'embeddings': embedding_stats,
            'config': {
                'cache_dir': self.config.get_cache_dir(),
                'embedding_enabled': self.config.is_embedding_enabled(),
                'exclude_patterns_count': len(self.config.get_exclude_patterns())
            }
        }

    def _classify_project_size(self, tokens: int) -> str:
        """Classify project size based on token count."""
        small_limit = self.config.get('token_limits.small_project', 500000)
        medium_limit = self.config.get('token_limits.medium_project', 2000000)

        if tokens < small_limit:
            return "small"
        elif tokens < medium_limit:
            return "medium"
        else:
            return "large"

    def cleanup_cache(self):
        """Clean up old cache files."""
        cache_dir = Path(self.config.get_cache_dir())
        if cache_dir.exists():
            print(Colors.yellow("Cleaning up cache..."))
            for file in cache_dir.glob("*"):
                if file.is_file():
                    file.unlink()
                    print(f"Removed: {file}")
            print(Colors.green("Cache cleaned"))


def main():
    """CLI entry point for indexer."""
    parser = argparse.ArgumentParser(
        description="Project File Indexer - Build and manage file indices",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python indexer.py --build        # Build file index
  python indexer.py --stats        # Show project statistics
  python indexer.py --embeddings   # Update embeddings
  python indexer.py --cleanup      # Clean cache
        """
    )
    parser.add_argument('--build', action='store_true', help='Build file index')
    parser.add_argument('--stats', action='store_true', help='Show project statistics')
    parser.add_argument('--embeddings', action='store_true', help='Update embeddings')
    parser.add_argument('--cleanup', action='store_true', help='Clean up cache files')
    parser.add_argument('--output', choices=['json', 'text'], default='text', help='Output format')
    parser.add_argument('--config', help='Configuration file path')
    parser.add_argument('--root', default='.', help='Root directory to analyze')

    args = parser.parse_args()

    # Require at least one action
    if not any([args.build, args.stats, args.embeddings, args.cleanup]):
        parser.error("At least one action is required: --build, --stats, --embeddings, or --cleanup")

    # Create indexer
    indexer = ProjectIndexer(args.config, args.root)

    try:
        if args.cleanup:
            indexer.cleanup_cache()
        if args.build:
            indexer.build_index()
        if args.embeddings:
            indexer.update_embeddings()
        if args.stats:
            stats = indexer.get_project_stats()
            if args.output == 'json':
                print(json.dumps(stats, indent=2, default=str))
            else:
                print(f"Total files: {stats['files']}")
                print(f"Total tokens: {stats['tokens']:,}")
                print(f"Project size: {stats['project_size']}")
                print(f"Categories: {stats['categories']}")
                # Only report an embedding count when embedding stats were actually collected;
                # the key itself is always present (possibly as an empty dict)
                if stats['embeddings']:
                    print(f"Embeddings: {stats['embeddings']['embedding_count']}")
    except KeyboardInterrupt:
        print(Colors.warning("\nOperation interrupted by user"))
        sys.exit(1)
    except Exception as e:
        print(Colors.error(f"Operation failed: {e}"))
        sys.exit(1)


if __name__ == "__main__":
    main()
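
Beyond the CLI shown in the epilog, ProjectIndexer can also be driven programmatically. A minimal sketch, assuming it is run from inside .claude/python_script/ so the core/ and utils/ imports above resolve; the config path and root directory here are placeholders:

# Hypothetical programmatic usage of the indexer (not part of indexer.py itself)
from indexer import ProjectIndexer

indexer = ProjectIndexer(config_path=None, root_path=".")
indexer.build_index()                  # build or refresh the file index
stats = indexer.get_project_stats()    # file/token counts, size class, config summary
print(stats["project_size"], stats["tokens"])

This mirrors what main() does for --build followed by --stats, minus the argument parsing and colored output.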