Remove LLM enhancement features and related components as requested. This deletes the LLM-related source code, CLI commands, front-end components, tests, scripts, and documentation, simplifying dependencies and reducing complexity while retaining core vector search. Validation confirmed the removal is complete and core functionality remains intact.

catlog22
2025-12-16 21:38:27 +08:00
parent d21066c282
commit b702791c2c
21 changed files with 375 additions and 7193 deletions

View File

@@ -1047,184 +1047,6 @@ def migrate(
        registry.close()


@app.command()
def enhance(
    path: Path = typer.Argument(Path("."), exists=True, file_okay=False, dir_okay=True, help="Project root to enhance."),
    tool: str = typer.Option("gemini", "--tool", "-t", help="LLM tool to use (gemini or qwen)."),
    batch_size: int = typer.Option(5, "--batch-size", "-b", min=1, max=20, help="Number of files to process per batch."),
    force: bool = typer.Option(False, "--force", "-f", help="Regenerate metadata for all files, even if metadata already exists."),
    json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."),
) -> None:
    """Generate LLM-enhanced semantic metadata for indexed files.

    Uses the CCW CLI to generate summaries, keywords, and purpose descriptions.
    Requires ccw to be installed and accessible in PATH.
    """
    _configure_logging(verbose)
    base_path = path.expanduser().resolve()
    registry: RegistryStore | None = None
    try:
        # Check if ccw is available
        import shutil
        import subprocess
        import sys

        try:
            ccw_cmd = shutil.which("ccw")
            if not ccw_cmd:
                raise FileNotFoundError("ccw not in PATH")
            # On Windows, .cmd files need shell=True
            if sys.platform == "win32":
                subprocess.run("ccw --version", shell=True, capture_output=True, check=True)
            else:
                subprocess.run(["ccw", "--version"], capture_output=True, check=True)
        except (subprocess.CalledProcessError, FileNotFoundError):
            raise CodexLensError("ccw CLI not found. Please install ccw first.")

        # Validate tool
        if tool not in ("gemini", "qwen"):
            raise CodexLensError(f"Invalid tool: {tool}. Must be 'gemini' or 'qwen'.")

        registry = RegistryStore()
        registry.initialize()
        mapper = PathMapper()

        # Find project
        project_info = registry.get_project(base_path)
        if not project_info:
            raise CodexLensError(f"No index found for: {base_path}. Run 'codex-lens init' first.")

        # Import LLM enhancer
        try:
            from codexlens.semantic.llm_enhancer import LLMEnhancer, LLMConfig
        except ImportError as e:
            raise CodexLensError(f"Semantic enhancement requires additional dependencies: {e}")

        # Create enhancer with config
        config = LLMConfig(tool=tool, batch_size=batch_size)
        enhancer = LLMEnhancer(config=config)

        # Get index directory
        index_dir = mapper.source_to_index_dir(base_path)
        if not index_dir.exists():
            raise CodexLensError(f"Index directory not found: {index_dir}")

        # Process all index databases recursively
        from codexlens.storage.dir_index import DirIndexStore

        total_processed = 0
        total_errors = 0
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
            TimeElapsedColumn(),
            console=console,
        ) as progress:
            # Find all _index.db files
            index_files = list(index_dir.rglob("_index.db"))
            task = progress.add_task(f"Enhancing {len(index_files)} directories...", total=len(index_files))
            for db_path in index_files:
                try:
                    store = DirIndexStore(db_path)
                    store.initialize()
                    # Get files to process
                    if force:
                        files_to_process = store.list_files()
                    else:
                        files_to_process = store.get_files_without_semantic()
                    if not files_to_process:
                        progress.update(task, advance=1)
                        continue
                    # Process files
                    for file_entry in files_to_process:
                        try:
                            # Read file content
                            with open(file_entry.full_path, "r", encoding="utf-8", errors="ignore") as f:
                                content = f.read()
                            # Generate metadata
                            metadata = enhancer.enhance_file(
                                path=str(file_entry.full_path),
                                content=content,
                                language=file_entry.language or "unknown",
                            )
                            # Store metadata
                            store.add_semantic_metadata(
                                file_id=file_entry.id,
                                summary=metadata.summary,
                                keywords=metadata.keywords,
                                purpose=metadata.purpose,
                                llm_tool=tool,
                            )
                            total_processed += 1
                        except Exception as e:
                            total_errors += 1
                            if verbose:
                                console.print(f"[yellow]Error processing {file_entry.full_path}: {e}[/yellow]")
                    store.close()
                except Exception as e:
                    total_errors += 1
                    if verbose:
                        console.print(f"[yellow]Error processing {db_path}: {e}[/yellow]")
                progress.update(task, advance=1)

        result = {
            "path": str(base_path),
            "tool": tool,
            "files_processed": total_processed,
            "errors": total_errors,
        }
        if json_mode:
            print_json(success=True, result=result)
        else:
            console.print(f"[green]Enhanced {total_processed} files using {tool}[/green]")
            if total_errors > 0:
                console.print(f"  [yellow]Errors: {total_errors}[/yellow]")
    except StorageError as exc:
        if json_mode:
            print_json(success=False, error=f"Storage error: {exc}")
        else:
            console.print(f"[red]Enhancement failed (storage):[/red] {exc}")
        raise typer.Exit(code=1)
    except PermissionError as exc:
        if json_mode:
            print_json(success=False, error=f"Permission denied: {exc}")
        else:
            console.print(f"[red]Enhancement failed (permission denied):[/red] {exc}")
        raise typer.Exit(code=1)
    except CodexLensError as exc:
        if json_mode:
            print_json(success=False, error=str(exc))
        else:
            console.print(f"[red]Enhancement failed:[/red] {exc}")
        raise typer.Exit(code=1)
    except Exception as exc:
        if json_mode:
            print_json(success=False, error=f"Unexpected error: {exc}")
        else:
            console.print(f"[red]Enhancement failed (unexpected):[/red] {exc}")
        raise typer.Exit(code=1)
    finally:
        if registry is not None:
            registry.close()
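
# --- Hedged usage sketch (illustration, not part of the original file) ---
# The command above can be exercised with typer's test runner; "app" is the
# Typer instance defined in this module, and the target path must already be
# indexed with `codex-lens init`:
#
#     from typer.testing import CliRunner
#
#     runner = CliRunner()
#     result = runner.invoke(app, ["enhance", ".", "--tool", "qwen", "--json"])
#     print(result.exit_code, result.stdout)
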
@app.command()
def clean(
    path: Optional[Path] = typer.Argument(None, help="Project path to clean (removes project index)."),

View File codexlens/semantic/__init__.py

@@ -32,38 +32,8 @@ def check_semantic_available() -> tuple[bool, str | None]:
    """Check if semantic search dependencies are available."""
    return SEMANTIC_AVAILABLE, _import_error


# Export LLM enhancement classes
try:
    from .llm_enhancer import (
        LLMEnhancer,
        LLMConfig,
        SemanticMetadata,
        FileData,
        EnhancedSemanticIndexer,
        create_enhancer,
        create_enhanced_indexer,
    )

    LLM_AVAILABLE = True
except ImportError:
    LLM_AVAILABLE = False
    LLMEnhancer = None  # type: ignore
    LLMConfig = None  # type: ignore
    SemanticMetadata = None  # type: ignore
    FileData = None  # type: ignore
    EnhancedSemanticIndexer = None  # type: ignore
    create_enhancer = None  # type: ignore
    create_enhanced_indexer = None  # type: ignore

__all__ = [
    "SEMANTIC_AVAILABLE",
    "SEMANTIC_BACKEND",
    "check_semantic_available",
    "LLM_AVAILABLE",
    "LLMEnhancer",
    "LLMConfig",
    "SemanticMetadata",
    "FileData",
    "EnhancedSemanticIndexer",
    "create_enhancer",
    "create_enhanced_indexer",
]
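
The guarded import above is the usual optional-dependency pattern: downstream code branches on the LLM_AVAILABLE flag instead of catching ImportError itself. A minimal sketch of a hypothetical caller (not code from this repository):

    from codexlens.semantic import LLM_AVAILABLE, LLMConfig, LLMEnhancer

    if LLM_AVAILABLE:
        enhancer = LLMEnhancer(LLMConfig(tool="qwen"))
    else:
        enhancer = None  # degrade gracefully to plain vector search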

View File codexlens/semantic/llm_enhancer.py

@@ -1,899 +0,0 @@
"""LLM-based semantic enhancement using CCW CLI.

This module provides LLM-generated descriptions that are then embedded
by fastembed for improved semantic search. The flow is:

    Code → LLM Summary → fastembed embedding → VectorStore → semantic search

LLM-generated summaries match natural language queries better than raw code.
"""
from __future__ import annotations

import json
import logging
import os
import shutil
import subprocess
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional, TYPE_CHECKING

from codexlens.entities import SemanticChunk, Symbol

if TYPE_CHECKING:
    from .embedder import Embedder
    from .vector_store import VectorStore

logger = logging.getLogger(__name__)


@dataclass
class SemanticMetadata:
    """LLM-generated semantic metadata for a file or symbol."""

    summary: str
    keywords: List[str]
    purpose: str
    file_path: Optional[str] = None
    symbol_name: Optional[str] = None
    llm_tool: Optional[str] = None


@dataclass
class FileData:
    """File data for LLM processing."""

    path: str
    content: str
    language: str
    symbols: List[Symbol] = field(default_factory=list)


@dataclass
class LLMConfig:
    """Configuration for LLM enhancement.

    Tool selection can be overridden via environment variables:
    - CCW_CLI_SECONDARY_TOOL: Primary tool for LLM calls (default: gemini)
    - CCW_CLI_FALLBACK_TOOL: Fallback tool if primary fails (default: qwen)
    """

    tool: str = field(default_factory=lambda: os.environ.get("CCW_CLI_SECONDARY_TOOL", "gemini"))
    fallback_tool: str = field(default_factory=lambda: os.environ.get("CCW_CLI_FALLBACK_TOOL", "qwen"))
    timeout_ms: int = 300000
    batch_size: int = 5
    max_content_chars: int = 8000  # Max chars per file in batch prompt
    enabled: bool = True
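
# --- Hedged configuration sketch (illustration, not part of the original file) ---
# Because the tool fields use default_factory, environment overrides are read
# when LLMConfig() is constructed, not when the module is imported:
#
#     os.environ["CCW_CLI_SECONDARY_TOOL"] = "qwen"
#     assert LLMConfig().tool == "qwen"                 # env default picked up
#     assert LLMConfig(tool="gemini").tool == "gemini"  # explicit value wins
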
class LLMEnhancer:
    """LLM-based semantic enhancement using CCW CLI.

    Generates code summaries and search keywords by calling
    external LLM tools (gemini, qwen) via CCW CLI subprocess.
    """

    CHUNK_REFINEMENT_PROMPT = '''PURPOSE: Identify optimal semantic split points in code chunk

TASK:
- Analyze the code structure to find natural semantic boundaries
- Identify logical groupings (functions, classes, related statements)
- Suggest split points that maintain semantic cohesion

MODE: analysis
EXPECTED: JSON format with split positions

=== CODE CHUNK ===
{code_chunk}

=== OUTPUT FORMAT ===
Return ONLY valid JSON (no markdown, no explanation):
{{
  "split_points": [
    {{
      "line": <line_number>,
      "reason": "brief reason for split (e.g., 'start of new function', 'end of class definition')"
    }}
  ]
}}

Rules:
- Split at function/class/method boundaries
- Keep related code together (don't split mid-function)
- Aim for chunks between 500-2000 characters
- Return empty split_points if no good splits found'''

    PROMPT_TEMPLATE = '''PURPOSE: Generate semantic summaries and search keywords for code files

TASK:
- For each code block, generate a concise summary (1-2 sentences)
- Extract 5-10 relevant search keywords
- Identify the functional purpose/category

MODE: analysis
EXPECTED: JSON format output

=== CODE BLOCKS ===
{code_blocks}

=== OUTPUT FORMAT ===
Return ONLY valid JSON (no markdown, no explanation):
{{
  "files": {{
    "<file_path>": {{
      "summary": "Brief description of what this code does",
      "keywords": ["keyword1", "keyword2", ...],
      "purpose": "category like: auth, api, util, ui, data, config, test"
    }}
  }}
}}'''

    def __init__(self, config: LLMConfig | None = None) -> None:
        """Initialize LLM enhancer.

        Args:
            config: LLM configuration, uses defaults if None
        """
        self.config = config or LLMConfig()
        self._ccw_available: Optional[bool] = None

    def check_available(self) -> bool:
        """Check if the CCW CLI tool is available."""
        if self._ccw_available is not None:
            return self._ccw_available
        self._ccw_available = shutil.which("ccw") is not None
        if not self._ccw_available:
            logger.warning("CCW CLI not found in PATH, LLM enhancement disabled")
        return self._ccw_available

    def enhance_files(
        self,
        files: List[FileData],
        working_dir: Optional[Path] = None,
    ) -> Dict[str, SemanticMetadata]:
        """Enhance multiple files with LLM-generated semantic metadata.

        Processes files in batches to manage token limits and API costs.

        Args:
            files: List of file data to process
            working_dir: Optional working directory for CCW CLI

        Returns:
            Dict mapping file paths to SemanticMetadata
        """
        if not self.config.enabled:
            logger.debug("LLM enhancement disabled by config")
            return {}
        if not self.check_available():
            return {}
        if not files:
            return {}

        results: Dict[str, SemanticMetadata] = {}
        batch_size = self.config.batch_size
        for i in range(0, len(files), batch_size):
            batch = files[i:i + batch_size]
            try:
                batch_results = self._process_batch(batch, working_dir)
                results.update(batch_results)
                logger.debug(
                    "Processed batch %d/%d: %d files enhanced",
                    i // batch_size + 1,
                    (len(files) + batch_size - 1) // batch_size,
                    len(batch_results),
                )
            except Exception as e:
                logger.warning(
                    "Batch %d failed, continuing: %s",
                    i // batch_size + 1,
                    e,
                )
                continue
        return results

    def enhance_file(
        self,
        path: str,
        content: str,
        language: str,
        working_dir: Optional[Path] = None,
    ) -> SemanticMetadata:
        """Enhance a single file with LLM-generated semantic metadata.

        Convenience method that wraps enhance_files for single-file processing.

        Args:
            path: File path
            content: File content
            language: Programming language
            working_dir: Optional working directory for CCW CLI

        Returns:
            SemanticMetadata for the file; if enhancement fails, a default
            placeholder metadata object is returned instead of raising.
        """
        file_data = FileData(path=path, content=content, language=language)
        results = self.enhance_files([file_data], working_dir)
        if path not in results:
            # Return default metadata if enhancement failed
            return SemanticMetadata(
                summary=f"Code file written in {language}",
                keywords=[language, "code"],
                purpose="unknown",
                file_path=path,
                llm_tool=self.config.tool,
            )
        return results[path]
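
    # Hedged usage sketch (illustration only; the path and content below are
    # hypothetical):
    #
    #     enhancer = LLMEnhancer()
    #     meta = enhancer.enhance_file("src/auth.py", "def login(): ...", "python")
    #     print(meta.summary, meta.keywords, meta.purpose)
    #
    # If CCW is unavailable or the call fails, this returns the default
    # placeholder metadata rather than raising.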
    def refine_chunk_boundaries(
        self,
        chunk: SemanticChunk,
        max_chunk_size: int = 2000,
        working_dir: Optional[Path] = None,
    ) -> List[SemanticChunk]:
        """Refine chunk boundaries using LLM for large code chunks.

        Uses LLM to identify semantic split points in large chunks,
        breaking them into smaller, more cohesive pieces.

        Args:
            chunk: Original chunk to refine
            max_chunk_size: Maximum characters before triggering refinement
            working_dir: Optional working directory for CCW CLI

        Returns:
            List of refined chunks (original chunk if no splits or refinement fails)
        """
        # Skip if chunk is small enough
        if len(chunk.content) <= max_chunk_size:
            return [chunk]
        # Skip if LLM enhancement disabled or unavailable
        if not self.config.enabled or not self.check_available():
            return [chunk]
        # Skip docstring chunks - only refine code chunks
        if chunk.metadata.get("chunk_type") == "docstring":
            return [chunk]

        try:
            # Build refinement prompt
            prompt = self.CHUNK_REFINEMENT_PROMPT.format(code_chunk=chunk.content)
            # Invoke LLM
            result = self._invoke_ccw_cli(
                prompt,
                tool=self.config.tool,
                working_dir=working_dir,
            )
            # Fall back if the primary tool fails
            if not result["success"] and self.config.fallback_tool:
                result = self._invoke_ccw_cli(
                    prompt,
                    tool=self.config.fallback_tool,
                    working_dir=working_dir,
                )
            if not result["success"]:
                logger.debug("LLM refinement failed, returning original chunk")
                return [chunk]

            # Parse split points
            split_points = self._parse_split_points(result["stdout"])
            if not split_points:
                logger.debug("No split points identified, returning original chunk")
                return [chunk]

            # Split chunk at identified boundaries
            refined_chunks = self._split_chunk_at_points(chunk, split_points)
            logger.debug(
                "Refined chunk into %d smaller chunks (was %d chars)",
                len(refined_chunks),
                len(chunk.content),
            )
            return refined_chunks
        except Exception as e:
            logger.warning("Chunk refinement error: %s, returning original chunk", e)
            return [chunk]

    def _parse_split_points(self, stdout: str) -> List[int]:
        """Parse split points from LLM response.

        Args:
            stdout: Raw stdout from CCW CLI

        Returns:
            List of line numbers where splits should occur (sorted)
        """
        # Extract JSON from response
        json_str = self._extract_json(stdout)
        if not json_str:
            return []
        try:
            data = json.loads(json_str)
            split_points_data = data.get("split_points", [])
            # Extract line numbers
            lines = []
            for point in split_points_data:
                if isinstance(point, dict) and "line" in point:
                    line_num = point["line"]
                    if isinstance(line_num, int) and line_num > 0:
                        lines.append(line_num)
            return sorted(set(lines))
        except (json.JSONDecodeError, ValueError, TypeError) as e:
            logger.debug("Failed to parse split points: %s", e)
            return []

    def _split_chunk_at_points(
        self,
        chunk: SemanticChunk,
        split_points: List[int],
    ) -> List[SemanticChunk]:
        """Split chunk at specified line numbers.

        Args:
            chunk: Original chunk to split
            split_points: Sorted list of line numbers to split at

        Returns:
            List of smaller chunks
        """
        lines = chunk.content.splitlines(keepends=True)
        chunks: List[SemanticChunk] = []
        # Get original metadata
        base_metadata = dict(chunk.metadata)
        original_start = base_metadata.get("start_line", 1)
        # Add start and end boundaries
        boundaries = [0] + split_points + [len(lines)]
        for i in range(len(boundaries) - 1):
            start_idx = boundaries[i]
            end_idx = boundaries[i + 1]
            # Skip empty sections
            if start_idx >= end_idx:
                continue
            # Extract content
            section_lines = lines[start_idx:end_idx]
            section_content = "".join(section_lines)
            # Skip if too small
            if len(section_content.strip()) < 50:
                continue
            # Create new chunk with updated metadata
            new_metadata = base_metadata.copy()
            new_metadata["start_line"] = original_start + start_idx
            new_metadata["end_line"] = original_start + end_idx - 1
            new_metadata["refined_by_llm"] = True
            new_metadata["original_chunk_size"] = len(chunk.content)
            chunks.append(
                SemanticChunk(
                    content=section_content,
                    embedding=None,  # Embeddings will be regenerated
                    metadata=new_metadata,
                )
            )
        # If no valid chunks were created, return the original
        if not chunks:
            return [chunk]
        return chunks
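
    # Hedged worked example (illustration only): for a 120-line chunk with
    # split_points == [40, 80], boundaries becomes [0, 40, 80, 120], yielding
    # the sections lines[0:40], lines[40:80], and lines[80:120]; any section
    # whose stripped content is under 50 characters is dropped.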
    def _process_batch(
        self,
        files: List[FileData],
        working_dir: Optional[Path] = None,
    ) -> Dict[str, SemanticMetadata]:
        """Process a single batch of files."""
        prompt = self._build_batch_prompt(files)
        # Try primary tool first
        result = self._invoke_ccw_cli(
            prompt,
            tool=self.config.tool,
            working_dir=working_dir,
        )
        # Fall back to the secondary tool if the primary fails
        if not result["success"] and self.config.fallback_tool:
            logger.debug(
                "Primary tool %s failed, trying fallback %s",
                self.config.tool,
                self.config.fallback_tool,
            )
            result = self._invoke_ccw_cli(
                prompt,
                tool=self.config.fallback_tool,
                working_dir=working_dir,
            )
        if not result["success"]:
            logger.warning("LLM call failed: %s", result.get("stderr", "unknown error"))
            return {}
        return self._parse_response(result["stdout"], self.config.tool)

    def _build_batch_prompt(self, files: List[FileData]) -> str:
        """Build prompt for batch processing."""
        code_blocks_parts: List[str] = []
        for file_data in files:
            # Truncate content if too long
            content = file_data.content
            if len(content) > self.config.max_content_chars:
                content = content[:self.config.max_content_chars] + "\n... [truncated]"
            # Format code block
            lang_hint = file_data.language or "text"
            code_block = f'''[FILE: {file_data.path}]
```{lang_hint}
{content}
```'''
            code_blocks_parts.append(code_block)
        code_blocks = "\n\n".join(code_blocks_parts)
        return self.PROMPT_TEMPLATE.format(code_blocks=code_blocks)

    def _invoke_ccw_cli(
        self,
        prompt: str,
        tool: str = "gemini",
        working_dir: Optional[Path] = None,
    ) -> Dict[str, Any]:
        """Invoke CCW CLI tool via subprocess.

        Args:
            prompt: The prompt to send to LLM
            tool: Tool name (gemini, qwen, codex)
            working_dir: Optional working directory

        Returns:
            Dict with success, stdout, stderr, exit_code
        """
        import sys

        timeout_seconds = (self.config.timeout_ms / 1000) + 30
        # Build base arguments
        base_args = [
            "cli", "exec",
            prompt,  # Direct string argument
            "--tool", tool,
            "--mode", "analysis",
            "--timeout", str(self.config.timeout_ms),
        ]
        if working_dir:
            base_args.extend(["--cd", str(working_dir)])
        try:
            if sys.platform == "win32":
                # On Windows, ccw is a .CMD wrapper that requires a shell.
                # Instead, directly invoke node with the ccw.js script.
                ccw_path = shutil.which("ccw")
                if ccw_path and ccw_path.lower().endswith(".cmd"):
                    # Find the ccw.js script location
                    npm_dir = Path(ccw_path).parent
                    ccw_js = npm_dir / "node_modules" / "ccw" / "bin" / "ccw.js"
                    if ccw_js.exists():
                        cmd = ["node", str(ccw_js)] + base_args
                    else:
                        # Fall back to shell execution
                        cmd_str = "ccw " + " ".join(f'"{a}"' if " " in a else a for a in base_args)
                        result = subprocess.run(
                            cmd_str, shell=True, capture_output=True, text=True,
                            timeout=timeout_seconds, cwd=working_dir,
                            encoding="utf-8", errors="replace",
                        )
                        return {
                            "success": result.returncode == 0,
                            "stdout": result.stdout,
                            "stderr": result.stderr,
                            "exit_code": result.returncode,
                        }
                else:
                    cmd = ["ccw"] + base_args
            else:
                cmd = ["ccw"] + base_args
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=timeout_seconds,
                cwd=working_dir,
                encoding="utf-8",
                errors="replace",
            )
            return {
                "success": result.returncode == 0,
                "stdout": result.stdout,
                "stderr": result.stderr,
                "exit_code": result.returncode,
            }
        except subprocess.TimeoutExpired:
            logger.warning("CCW CLI timeout after %ds", self.config.timeout_ms // 1000)
            return {
                "success": False,
                "stdout": "",
                "stderr": "timeout",
                "exit_code": -1,
            }
        except FileNotFoundError:
            logger.warning("CCW CLI not found - ensure 'ccw' is in PATH")
            return {
                "success": False,
                "stdout": "",
                "stderr": "ccw command not found",
                "exit_code": -1,
            }
        except Exception as e:
            logger.warning("CCW CLI invocation failed: %s", e)
            return {
                "success": False,
                "stdout": "",
                "stderr": str(e),
                "exit_code": -1,
            }

    def _parse_response(
        self,
        stdout: str,
        tool: str,
    ) -> Dict[str, SemanticMetadata]:
        """Parse LLM response into SemanticMetadata objects.

        Args:
            stdout: Raw stdout from CCW CLI
            tool: Tool name used for generation

        Returns:
            Dict mapping file paths to SemanticMetadata
        """
        results: Dict[str, SemanticMetadata] = {}
        # Extract JSON from response (may be wrapped in markdown or other text)
        json_str = self._extract_json(stdout)
        if not json_str:
            logger.warning("No JSON found in LLM response")
            return results
        try:
            data = json.loads(json_str)
        except json.JSONDecodeError as e:
            logger.warning("Failed to parse LLM response JSON: %s", e)
            return results
        # Handle expected format: {"files": {"path": {...}}}
        files_data = data.get("files", data)
        if not isinstance(files_data, dict):
            logger.warning("Unexpected response format: expected dict")
            return results
        for file_path, metadata in files_data.items():
            if not isinstance(metadata, dict):
                continue
            try:
                results[file_path] = SemanticMetadata(
                    summary=metadata.get("summary", ""),
                    keywords=metadata.get("keywords", []),
                    purpose=metadata.get("purpose", ""),
                    file_path=file_path,
                    llm_tool=tool,
                )
            except Exception as e:
                logger.debug("Failed to parse metadata for %s: %s", file_path, e)
                continue
        return results

    def _extract_json(self, text: str) -> Optional[str]:
        """Extract a JSON object from text that may contain markdown or other content."""
        # Try to find JSON object boundaries
        text = text.strip()
        # Remove markdown code blocks if present
        if text.startswith("```"):
            lines = text.split("\n")
            # Remove first line (```json or ```)
            lines = lines[1:]
            # Find closing ```
            for i, line in enumerate(lines):
                if line.strip() == "```":
                    lines = lines[:i]
                    break
            text = "\n".join(lines)
        # Find JSON object
        start = text.find("{")
        if start == -1:
            return None
        # Find matching closing brace
        depth = 0
        end = start
        for i, char in enumerate(text[start:], start):
            if char == "{":
                depth += 1
            elif char == "}":
                depth -= 1
                if depth == 0:
                    end = i + 1
                    break
        if depth != 0:
            return None
        return text[start:end]
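
    # Hedged worked example (illustration only): the brace matching tolerates
    # markdown fences and surrounding chatter:
    #
    #     raw = 'Sure:\n```json\n{"files": {"a.py": {"summary": "x"}}}\n```'
    #     LLMEnhancer()._extract_json(raw)
    #     # -> '{"files": {"a.py": {"summary": "x"}}}'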
def create_enhancer(
    tool: str = "gemini",
    timeout_ms: int = 300000,
    batch_size: int = 5,
    enabled: bool = True,
) -> LLMEnhancer:
    """Factory function to create an LLM enhancer with custom config."""
    config = LLMConfig(
        tool=tool,
        timeout_ms=timeout_ms,
        batch_size=batch_size,
        enabled=enabled,
    )
    return LLMEnhancer(config)


class EnhancedSemanticIndexer:
    """Integrates LLM enhancement with fastembed vector search.

    Flow:
    1. Code files → LLM generates summaries/keywords
    2. Summaries → fastembed generates embeddings
    3. Embeddings → VectorStore for similarity search

    This produces better semantic search because:
    - LLM summaries are natural language descriptions
    - Natural language queries match summaries better than raw code
    - Keywords expand search coverage
    """

    def __init__(
        self,
        enhancer: LLMEnhancer,
        embedder: "Embedder",
        vector_store: "VectorStore",
    ) -> None:
        """Initialize enhanced semantic indexer.

        Args:
            enhancer: LLM enhancer for generating summaries
            embedder: Fastembed embedder for vector generation
            vector_store: Vector storage for similarity search
        """
        self.enhancer = enhancer
        self.embedder = embedder
        self.vector_store = vector_store

    def index_files(
        self,
        files: List[FileData],
        working_dir: Optional[Path] = None,
    ) -> int:
        """Index files with LLM-enhanced semantic search.

        Args:
            files: List of file data to index
            working_dir: Optional working directory for LLM calls

        Returns:
            Number of files successfully indexed
        """
        if not files:
            return 0
        # Step 1: Generate LLM summaries
        logger.info("Generating LLM summaries for %d files...", len(files))
        metadata_map = self.enhancer.enhance_files(files, working_dir)
        if not metadata_map:
            logger.warning("No LLM metadata generated, falling back to raw code")
            return self._index_raw_code(files)
        # Step 2: Create semantic chunks from LLM summaries
        chunks_to_embed: List[SemanticChunk] = []
        file_paths: List[str] = []
        for file_data in files:
            metadata = metadata_map.get(file_data.path)
            if metadata:
                # Use LLM-generated summary + keywords for embedding
                embeddable_text = self._create_embeddable_text(metadata, file_data)
                chunk = SemanticChunk(
                    content=embeddable_text,
                    embedding=None,
                    metadata={
                        "file": file_data.path,
                        "language": file_data.language,
                        "summary": metadata.summary,
                        "keywords": metadata.keywords,
                        "purpose": metadata.purpose,
                        "llm_tool": metadata.llm_tool,
                        "strategy": "llm_enhanced",
                    },
                )
            else:
                # Fallback: use truncated raw code
                chunk = SemanticChunk(
                    content=file_data.content[:2000],
                    embedding=None,
                    metadata={
                        "file": file_data.path,
                        "language": file_data.language,
                        "strategy": "raw_code",
                    },
                )
            chunks_to_embed.append(chunk)
            file_paths.append(file_data.path)
        # Step 3: Generate embeddings
        logger.info("Generating embeddings for %d chunks...", len(chunks_to_embed))
        texts = [chunk.content for chunk in chunks_to_embed]
        embeddings = self.embedder.embed(texts)
        # Step 4: Store in vector store
        indexed_count = 0
        for chunk, embedding, file_path in zip(chunks_to_embed, embeddings, file_paths):
            chunk.embedding = embedding
            try:
                self.vector_store.add_chunk(chunk, file_path)
                indexed_count += 1
            except Exception as e:
                logger.debug("Failed to store chunk for %s: %s", file_path, e)
        logger.info("Successfully indexed %d/%d files", indexed_count, len(files))
        return indexed_count

    def _create_embeddable_text(
        self,
        metadata: SemanticMetadata,
        file_data: FileData,
    ) -> str:
        """Create text optimized for embedding from LLM metadata.

        Combines summary, keywords, and purpose into a single string
        that will produce good semantic matches for natural language queries.
        """
        parts = []
        # Summary is the primary content
        if metadata.summary:
            parts.append(metadata.summary)
        # Purpose adds categorical context
        if metadata.purpose:
            parts.append(f"Category: {metadata.purpose}")
        # Keywords expand search coverage
        if metadata.keywords:
            parts.append(f"Keywords: {', '.join(metadata.keywords)}")
        # Add file name for context
        parts.append(f"File: {Path(file_data.path).name}")
        return "\n".join(parts)
    def _index_raw_code(self, files: List[FileData]) -> int:
        """Fallback: index raw code without LLM enhancement."""
        indexed_count = 0
        for file_data in files:
            # Truncate to a reasonable size
            content = file_data.content[:2000]
            chunk = SemanticChunk(
                content=content,
                embedding=None,
                metadata={
                    "file": file_data.path,
                    "language": file_data.language,
                    "strategy": "raw_code",
                },
            )
            try:
                embedding = self.embedder.embed_single(content)
                chunk.embedding = embedding
                self.vector_store.add_chunk(chunk, file_data.path)
                indexed_count += 1
            except Exception as e:
                logger.debug("Failed to index %s: %s", file_data.path, e)
        return indexed_count


def create_enhanced_indexer(
    vector_store_path: Path,
    llm_tool: str = "gemini",
    llm_enabled: bool = True,
) -> EnhancedSemanticIndexer:
    """Factory function to create an enhanced semantic indexer.

    Args:
        vector_store_path: Path for the vector store database
        llm_tool: LLM tool to use (gemini, qwen)
        llm_enabled: Whether to enable LLM enhancement

    Returns:
        Configured EnhancedSemanticIndexer instance
    """
    from .embedder import Embedder
    from .vector_store import VectorStore

    enhancer = create_enhancer(tool=llm_tool, enabled=llm_enabled)
    embedder = Embedder()
    vector_store = VectorStore(vector_store_path)
    return EnhancedSemanticIndexer(enhancer, embedder, vector_store)
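
For reference, a minimal end-to-end sketch of how this now-removed module was wired together; the source file path and vector store location below are hypothetical:

    from pathlib import Path
    from codexlens.semantic.llm_enhancer import FileData, create_enhanced_indexer

    indexer = create_enhanced_indexer(Path(".codexlens/vectors.db"), llm_tool="gemini")
    source = Path("src/auth.py")
    files = [FileData(path=str(source), content=source.read_text(), language="python")]
    print(indexer.index_files(files), "file(s) indexed")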