Add comprehensive tests for vector/semantic search functionality

- Implement full coverage tests for Embedder model loading and embedding generation
- Add CRUD operations and caching tests for VectorStore
- Include cosine similarity computation tests (see the test sketch below)
- Validate semantic search accuracy and relevance through various queries
- Establish performance benchmarks for embedding and search operations
- Ensure edge cases and error handling are covered
- Test thread safety and concurrent access scenarios
- Verify availability of semantic search dependencies
catlog22
2025-12-14 17:17:09 +08:00
parent 8d542b8e45
commit 79a2953862
47 changed files with 11208 additions and 4336 deletions
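
As a taste of the suite, here is a minimal sketch of the cosine-similarity tests called out above, assuming pytest and numpy; the cosine_similarity helper is illustrative, not necessarily the project's actual API:

import numpy as np

def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    # Guard against zero vectors to avoid division by zero.
    denom = float(np.linalg.norm(a) * np.linalg.norm(b))
    return float(np.dot(a, b)) / denom if denom else 0.0

def test_cosine_similarity_bounds():
    a = np.array([1.0, 0.0])
    assert cosine_similarity(a, a) == 1.0                     # identical vectors
    assert cosine_similarity(a, np.array([0.0, 1.0])) == 0.0  # orthogonal
    assert cosine_similarity(a, -a) == -1.0                   # opposite direction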


@@ -1098,3 +1098,132 @@ def clean(
else:
console.print(f"[red]Clean failed (unexpected):[/red] {exc}")
raise typer.Exit(code=1)
@app.command("semantic-list")
def semantic_list(
path: Path = typer.Option(Path("."), "--path", "-p", help="Project path to list metadata from."),
offset: int = typer.Option(0, "--offset", "-o", min=0, help="Number of records to skip."),
limit: int = typer.Option(50, "--limit", "-n", min=1, max=100, help="Maximum records to return."),
tool_filter: Optional[str] = typer.Option(None, "--tool", "-t", help="Filter by LLM tool (gemini/qwen)."),
json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."),
) -> None:
"""List semantic metadata entries for indexed files.
Shows files that have LLM-generated summaries and keywords.
Results are aggregated from all index databases in the project.
"""
_configure_logging(verbose)
base_path = path.expanduser().resolve()
registry: Optional[RegistryStore] = None
try:
registry = RegistryStore()
registry.initialize()
mapper = PathMapper()
project_info = registry.find_project(base_path)
if not project_info:
raise CodexLensError(f"No index found for: {base_path}. Run 'codex-lens init' first.")
index_dir = mapper.source_to_index_dir(base_path)
if not index_dir.exists():
raise CodexLensError(f"Index directory not found: {index_dir}")
all_results: list = []
total_count = 0
index_files = sorted(index_dir.rglob("_index.db"))
for db_path in index_files:
try:
store = DirIndexStore(db_path)
store.initialize()
results, count = store.list_semantic_metadata(
offset=0,
limit=1000,
llm_tool=tool_filter,
)
source_dir = mapper.index_to_source(db_path.parent)
for r in results:
r["source_dir"] = str(source_dir)
all_results.extend(results)
total_count += count
store.close()
except Exception as e:
if verbose:
console.print(f"[yellow]Warning: Error reading {db_path}: {e}[/yellow]")
all_results.sort(key=lambda x: x["generated_at"], reverse=True)
paginated = all_results[offset : offset + limit]
result = {
"path": str(base_path),
"total": total_count,
"offset": offset,
"limit": limit,
"count": len(paginated),
"entries": paginated,
}
if json_mode:
print_json(success=True, result=result)
else:
if not paginated:
console.print("[yellow]No semantic metadata found.[/yellow]")
console.print("Run 'codex-lens enhance' to generate metadata for indexed files.")
else:
table = Table(title=f"Semantic Metadata ({total_count} total)")
table.add_column("File", style="cyan", max_width=40)
table.add_column("Language", style="dim")
table.add_column("Purpose", max_width=30)
table.add_column("Keywords", max_width=25)
table.add_column("Tool")
for entry in paginated:
keywords_str = ", ".join(entry["keywords"][:3])
if len(entry["keywords"]) > 3:
keywords_str += f" (+{len(entry['keywords']) - 3})"
table.add_row(
entry["file_name"],
entry["language"] or "-",
(entry["purpose"] or "-")[:30],
keywords_str or "-",
entry["llm_tool"] or "-",
)
console.print(table)
if total_count > len(paginated):
console.print(
f"[dim]Showing {offset + 1}-{offset + len(paginated)} of {total_count}. "
"Use --offset and --limit for pagination.[/dim]"
)
except StorageError as exc:
if json_mode:
print_json(success=False, error=f"Storage error: {exc}")
else:
console.print(f"[red]Semantic-list failed (storage):[/red] {exc}")
raise typer.Exit(code=1)
except CodexLensError as exc:
if json_mode:
print_json(success=False, error=str(exc))
else:
console.print(f"[red]Semantic-list failed:[/red] {exc}")
raise typer.Exit(code=1)
except Exception as exc:
if json_mode:
print_json(success=False, error=f"Unexpected error: {exc}")
else:
console.print(f"[red]Semantic-list failed (unexpected):[/red] {exc}")
raise typer.Exit(code=1)
finally:
if registry is not None:
registry.close()
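
For reference, a typical invocation of the new command using the flags defined above (values illustrative):

codex-lens semantic-list --path . --limit 20 --tool gemini --json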


@@ -78,6 +78,11 @@ class Config:
}
)
llm_enabled: bool = False
llm_tool: str = "gemini"
llm_timeout_ms: int = 300000
llm_batch_size: int = 5
def __post_init__(self) -> None:
try:
self.data_dir = self.data_dir.expanduser().resolve()
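
A quick sketch of how the new LLM fields might be set, assuming Config's remaining fields all carry defaults (not shown in this hunk):

config = Config(llm_enabled=True, llm_tool="qwen", llm_timeout_ms=120000, llm_batch_size=10)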


@@ -30,6 +30,7 @@ class SearchOptions:
total_limit: Total result limit across all directories
include_symbols: Whether to include symbol search results
files_only: Return only file paths without excerpts
include_semantic: Whether to include semantic keyword search results
"""
depth: int = -1
max_workers: int = 8
@@ -37,6 +38,7 @@ class SearchOptions:
total_limit: int = 100
include_symbols: bool = False
files_only: bool = False
include_semantic: bool = False
@dataclass
@@ -378,7 +380,8 @@ class ChainSearchEngine:
idx_path,
query,
options.limit_per_dir,
options.files_only
options.files_only,
options.include_semantic
): idx_path
for idx_path in index_paths
}
@@ -400,7 +403,8 @@ class ChainSearchEngine:
def _search_single_index(self, index_path: Path,
query: str,
limit: int,
files_only: bool = False) -> List[SearchResult]:
files_only: bool = False,
include_semantic: bool = False) -> List[SearchResult]:
"""Search a single index database.
Handles exceptions gracefully, returning empty list on failure.
@@ -410,18 +414,40 @@ class ChainSearchEngine:
query: FTS5 query string
limit: Maximum results from this index
files_only: If True, skip snippet generation for faster search
include_semantic: If True, also search semantic keywords and merge results
Returns:
List of SearchResult objects (empty on error)
"""
try:
with DirIndexStore(index_path) as store:
# Get FTS results
if files_only:
# Fast path: return paths only without snippets
paths = store.search_files_only(query, limit=limit)
return [SearchResult(path=p, score=0.0, excerpt="") for p in paths]
fts_results = [SearchResult(path=p, score=0.0, excerpt="") for p in paths]
else:
return store.search_fts(query, limit=limit)
fts_results = store.search_fts(query, limit=limit)
# Optionally add semantic keyword results
if include_semantic:
try:
semantic_matches = store.search_semantic_keywords(query)
# Convert semantic matches to SearchResult with 0.8x weight
for file_entry, keywords in semantic_matches:
# Create excerpt from keywords
excerpt = f"Keywords: {', '.join(keywords[:5])}"
# Use a base score of 10.0 for semantic matches, weighted by 0.8
semantic_result = SearchResult(
path=str(file_entry.full_path),
score=10.0 * 0.8,
excerpt=excerpt
)
fts_results.append(semantic_result)
except Exception as sem_exc:
self.logger.debug(f"Semantic search error in {index_path}: {sem_exc}")
return fts_results
except Exception as exc:
self.logger.debug(f"Search error in {index_path}: {exc}")
return []
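
A caller-side sketch of the new flag; it assumes ChainSearchEngine exposes a public search(query, options) entry point, which this diff does not show:

options = SearchOptions(include_semantic=True, total_limit=50)
results = engine.search("token refresh", options)  # engine: a ChainSearchEngine
for r in results:
    print(r.path, r.score, r.excerpt)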


@@ -32,4 +32,38 @@ def check_semantic_available() -> tuple[bool, str | None]:
"""Check if semantic search dependencies are available."""
return SEMANTIC_AVAILABLE, _import_error
__all__ = ["SEMANTIC_AVAILABLE", "SEMANTIC_BACKEND", "check_semantic_available"]
# Export LLM enhancement classes
try:
from .llm_enhancer import (
LLMEnhancer,
LLMConfig,
SemanticMetadata,
FileData,
EnhancedSemanticIndexer,
create_enhancer,
create_enhanced_indexer,
)
LLM_AVAILABLE = True
except ImportError:
LLM_AVAILABLE = False
LLMEnhancer = None # type: ignore
LLMConfig = None # type: ignore
SemanticMetadata = None # type: ignore
FileData = None # type: ignore
EnhancedSemanticIndexer = None # type: ignore
create_enhancer = None # type: ignore
create_enhanced_indexer = None # type: ignore
__all__ = [
"SEMANTIC_AVAILABLE",
"SEMANTIC_BACKEND",
"check_semantic_available",
"LLM_AVAILABLE",
"LLMEnhancer",
"LLMConfig",
"SemanticMetadata",
"FileData",
"EnhancedSemanticIndexer",
"create_enhancer",
"create_enhanced_indexer",
]
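
Downstream code can gate on these flags; a minimal sketch using only the names exported above:

from codexlens.semantic import LLM_AVAILABLE, check_semantic_available, create_enhancer

available, err = check_semantic_available()
if available and LLM_AVAILABLE:
    enhancer = create_enhancer(tool="gemini")
else:
    print(f"Semantic enhancement unavailable: {err}")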


@@ -0,0 +1,667 @@
"""LLM-based semantic enhancement using CCW CLI.
This module provides LLM-generated descriptions that are then embedded
by fastembed for improved semantic search. The flow is:
Code → LLM Summary → fastembed embedding → VectorStore → semantic search
LLM-generated summaries match natural language queries better than raw code.
"""
from __future__ import annotations
import json
import logging
import os
import subprocess
import shutil
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional, TYPE_CHECKING
from codexlens.entities import SemanticChunk, Symbol
if TYPE_CHECKING:
from .embedder import Embedder
from .vector_store import VectorStore
logger = logging.getLogger(__name__)
@dataclass
class SemanticMetadata:
"""LLM-generated semantic metadata for a file or symbol."""
summary: str
keywords: List[str]
purpose: str
file_path: Optional[str] = None
symbol_name: Optional[str] = None
llm_tool: Optional[str] = None
@dataclass
class FileData:
"""File data for LLM processing."""
path: str
content: str
language: str
symbols: List[Symbol] = field(default_factory=list)
@dataclass
class LLMConfig:
"""Configuration for LLM enhancement.
Tool selection can be overridden via environment variables:
- CCW_CLI_SECONDARY_TOOL: Primary tool for LLM calls (default: gemini)
- CCW_CLI_FALLBACK_TOOL: Fallback tool if primary fails (default: qwen)
"""
tool: str = field(default_factory=lambda: os.environ.get("CCW_CLI_SECONDARY_TOOL", "gemini"))
fallback_tool: str = field(default_factory=lambda: os.environ.get("CCW_CLI_FALLBACK_TOOL", "qwen"))
timeout_ms: int = 300000
batch_size: int = 5
max_content_chars: int = 8000 # Max chars per file in batch prompt
enabled: bool = True
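# Usage sketch (illustrative): the environment overrides are read once, when
# the dataclass is constructed, and an explicit argument still wins:
#   os.environ["CCW_CLI_SECONDARY_TOOL"] = "qwen"
#   config = LLMConfig()               # config.tool == "qwen"
#   config = LLMConfig(tool="gemini")  # explicit argument overrides the env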
class LLMEnhancer:
"""LLM-based semantic enhancement using CCW CLI.
Generates code summaries and search keywords by calling
external LLM tools (gemini, qwen) via CCW CLI subprocess.
"""
PROMPT_TEMPLATE = '''PURPOSE: Generate semantic summaries and search keywords for code files
TASK:
- For each code block, generate a concise summary (1-2 sentences)
- Extract 5-10 relevant search keywords
- Identify the functional purpose/category
MODE: analysis
EXPECTED: JSON format output
=== CODE BLOCKS ===
{code_blocks}
=== OUTPUT FORMAT ===
Return ONLY valid JSON (no markdown, no explanation):
{{
"files": {{
"<file_path>": {{
"summary": "Brief description of what this code does",
"keywords": ["keyword1", "keyword2", ...],
"purpose": "category like: auth, api, util, ui, data, config, test"
}}
}}
}}'''
def __init__(self, config: LLMConfig | None = None) -> None:
"""Initialize LLM enhancer.
Args:
config: LLM configuration, uses defaults if None
"""
self.config = config or LLMConfig()
self._ccw_available: Optional[bool] = None
def check_available(self) -> bool:
"""Check if CCW CLI tool is available."""
if self._ccw_available is not None:
return self._ccw_available
self._ccw_available = shutil.which("ccw") is not None
if not self._ccw_available:
logger.warning("CCW CLI not found in PATH, LLM enhancement disabled")
return self._ccw_available
def enhance_files(
self,
files: List[FileData],
working_dir: Optional[Path] = None,
) -> Dict[str, SemanticMetadata]:
"""Enhance multiple files with LLM-generated semantic metadata.
Processes files in batches to manage token limits and API costs.
Args:
files: List of file data to process
working_dir: Optional working directory for CCW CLI
Returns:
Dict mapping file paths to SemanticMetadata
"""
if not self.config.enabled:
logger.debug("LLM enhancement disabled by config")
return {}
if not self.check_available():
return {}
if not files:
return {}
results: Dict[str, SemanticMetadata] = {}
batch_size = self.config.batch_size
for i in range(0, len(files), batch_size):
batch = files[i:i + batch_size]
try:
batch_results = self._process_batch(batch, working_dir)
results.update(batch_results)
logger.debug(
"Processed batch %d/%d: %d files enhanced",
i // batch_size + 1,
(len(files) + batch_size - 1) // batch_size,
len(batch_results),
)
except Exception as e:
logger.warning(
"Batch %d failed, continuing: %s",
i // batch_size + 1,
e,
)
continue
return results
def enhance_file(
self,
path: str,
content: str,
language: str,
working_dir: Optional[Path] = None,
) -> SemanticMetadata:
"""Enhance a single file with LLM-generated semantic metadata.
Convenience method that wraps enhance_files for single file processing.
Args:
path: File path
content: File content
language: Programming language
working_dir: Optional working directory for CCW CLI
Returns:
SemanticMetadata for the file
Raises:
ValueError: If enhancement fails
"""
file_data = FileData(path=path, content=content, language=language)
results = self.enhance_files([file_data], working_dir)
if path not in results:
# Return default metadata if enhancement failed
return SemanticMetadata(
summary=f"Code file written in {language}",
keywords=[language, "code"],
purpose="unknown",
file_path=path,
llm_tool=self.config.tool,
)
return results[path]
def _process_batch(
self,
files: List[FileData],
working_dir: Optional[Path] = None,
) -> Dict[str, SemanticMetadata]:
"""Process a single batch of files."""
prompt = self._build_batch_prompt(files)
# Try primary tool first, tracking which tool actually produced the result
used_tool = self.config.tool
result = self._invoke_ccw_cli(
prompt,
tool=used_tool,
working_dir=working_dir,
)
# Fall back to the secondary tool if the primary fails
if not result["success"] and self.config.fallback_tool:
logger.debug(
"Primary tool %s failed, trying fallback %s",
self.config.tool,
self.config.fallback_tool,
)
used_tool = self.config.fallback_tool
result = self._invoke_ccw_cli(
prompt,
tool=used_tool,
working_dir=working_dir,
)
if not result["success"]:
logger.warning("LLM call failed: %s", result.get("stderr", "unknown error"))
return {}
# Attribute the metadata to the tool that generated it (fallback included)
return self._parse_response(result["stdout"], used_tool)
def _build_batch_prompt(self, files: List[FileData]) -> str:
"""Build prompt for batch processing."""
code_blocks_parts: List[str] = []
for file_data in files:
# Truncate content if too long
content = file_data.content
if len(content) > self.config.max_content_chars:
content = content[:self.config.max_content_chars] + "\n... [truncated]"
# Format code block
lang_hint = file_data.language or "text"
code_block = f'''[FILE: {file_data.path}]
```{lang_hint}
{content}
```'''
code_blocks_parts.append(code_block)
code_blocks = "\n\n".join(code_blocks_parts)
return self.PROMPT_TEMPLATE.format(code_blocks=code_blocks)
def _invoke_ccw_cli(
self,
prompt: str,
tool: str = "gemini",
working_dir: Optional[Path] = None,
) -> Dict[str, Any]:
"""Invoke CCW CLI tool via subprocess.
Args:
prompt: The prompt to send to LLM
tool: Tool name (gemini, qwen, codex)
working_dir: Optional working directory
Returns:
Dict with success, stdout, stderr, exit_code
"""
import sys
timeout_seconds = (self.config.timeout_ms / 1000) + 30
# Build base arguments
base_args = [
"cli", "exec",
prompt, # Direct string argument
"--tool", tool,
"--mode", "analysis",
"--timeout", str(self.config.timeout_ms),
]
if working_dir:
base_args.extend(["--cd", str(working_dir)])
try:
if sys.platform == "win32":
# On Windows, ccw is a .CMD wrapper that requires shell
# Instead, directly invoke node with the ccw.js script
ccw_path = shutil.which("ccw")
if ccw_path and ccw_path.lower().endswith(".cmd"):
# Find the ccw.js script location
npm_dir = Path(ccw_path).parent
ccw_js = npm_dir / "node_modules" / "ccw" / "bin" / "ccw.js"
if ccw_js.exists():
cmd = ["node", str(ccw_js)] + base_args
else:
# Fallback to shell execution
cmd_str = "ccw " + " ".join(f'"{a}"' if " " in a else a for a in base_args)
result = subprocess.run(
cmd_str, shell=True, capture_output=True, text=True,
timeout=timeout_seconds, cwd=working_dir,
encoding="utf-8", errors="replace",
)
return {
"success": result.returncode == 0,
"stdout": result.stdout,
"stderr": result.stderr,
"exit_code": result.returncode,
}
else:
cmd = ["ccw"] + base_args
else:
cmd = ["ccw"] + base_args
result = subprocess.run(
cmd,
capture_output=True,
text=True,
timeout=timeout_seconds,
cwd=working_dir,
encoding="utf-8",
errors="replace",
)
return {
"success": result.returncode == 0,
"stdout": result.stdout,
"stderr": result.stderr,
"exit_code": result.returncode,
}
except subprocess.TimeoutExpired:
logger.warning("CCW CLI timeout after %ds", self.config.timeout_ms / 1000)
return {
"success": False,
"stdout": "",
"stderr": "timeout",
"exit_code": -1,
}
except FileNotFoundError:
logger.warning("CCW CLI not found - ensure 'ccw' is in PATH")
return {
"success": False,
"stdout": "",
"stderr": "ccw command not found",
"exit_code": -1,
}
except Exception as e:
logger.warning("CCW CLI invocation failed: %s", e)
return {
"success": False,
"stdout": "",
"stderr": str(e),
"exit_code": -1,
}
def _parse_response(
self,
stdout: str,
tool: str,
) -> Dict[str, SemanticMetadata]:
"""Parse LLM response into SemanticMetadata objects.
Args:
stdout: Raw stdout from CCW CLI
tool: Tool name used for generation
Returns:
Dict mapping file paths to SemanticMetadata
"""
results: Dict[str, SemanticMetadata] = {}
# Extract JSON from response (may be wrapped in markdown or other text)
json_str = self._extract_json(stdout)
if not json_str:
logger.warning("No JSON found in LLM response")
return results
try:
data = json.loads(json_str)
except json.JSONDecodeError as e:
logger.warning("Failed to parse LLM response JSON: %s", e)
return results
# Handle expected format: {"files": {"path": {...}}}
files_data = data.get("files", data)
if not isinstance(files_data, dict):
logger.warning("Unexpected response format: expected dict")
return results
for file_path, metadata in files_data.items():
if not isinstance(metadata, dict):
continue
try:
results[file_path] = SemanticMetadata(
summary=metadata.get("summary", ""),
keywords=metadata.get("keywords", []),
purpose=metadata.get("purpose", ""),
file_path=file_path,
llm_tool=tool,
)
except Exception as e:
logger.debug("Failed to parse metadata for %s: %s", file_path, e)
continue
return results
def _extract_json(self, text: str) -> Optional[str]:
"""Extract JSON object from text that may contain markdown or other content."""
# Try to find JSON object boundaries
text = text.strip()
# Remove markdown code blocks if present
if text.startswith("```"):
lines = text.split("\n")
# Remove first line (```json or ```)
lines = lines[1:]
# Find closing ```
for i, line in enumerate(lines):
if line.strip() == "```":
lines = lines[:i]
break
text = "\n".join(lines)
# Find JSON object
start = text.find("{")
if start == -1:
return None
# Find matching closing brace
depth = 0
end = start
for i, char in enumerate(text[start:], start):
if char == "{":
depth += 1
elif char == "}":
depth -= 1
if depth == 0:
end = i + 1
break
if depth != 0:
return None
return text[start:end]
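# Worked example (illustrative): given a fenced response such as
#   ```json
#   {"files": {"a.py": {"summary": "...", "keywords": [], "purpose": "util"}}}
#   ```
# the fence is stripped first, then brace matching runs from the first "{",
# so any trailing prose after the JSON object is ignored.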
def create_enhancer(
tool: str = "gemini",
timeout_ms: int = 300000,
batch_size: int = 5,
enabled: bool = True,
) -> LLMEnhancer:
"""Factory function to create LLM enhancer with custom config."""
config = LLMConfig(
tool=tool,
timeout_ms=timeout_ms,
batch_size=batch_size,
enabled=enabled,
)
return LLMEnhancer(config)
class EnhancedSemanticIndexer:
"""Integrates LLM enhancement with fastembed vector search.
Flow:
1. Code files → LLM generates summaries/keywords
2. Summaries → fastembed generates embeddings
3. Embeddings → VectorStore for similarity search
This produces better semantic search because:
- LLM summaries are natural language descriptions
- Natural language queries match summaries better than raw code
- Keywords expand search coverage
"""
def __init__(
self,
enhancer: LLMEnhancer,
embedder: "Embedder",
vector_store: "VectorStore",
) -> None:
"""Initialize enhanced semantic indexer.
Args:
enhancer: LLM enhancer for generating summaries
embedder: Fastembed embedder for vector generation
vector_store: Vector storage for similarity search
"""
self.enhancer = enhancer
self.embedder = embedder
self.vector_store = vector_store
def index_files(
self,
files: List[FileData],
working_dir: Optional[Path] = None,
) -> int:
"""Index files with LLM-enhanced semantic search.
Args:
files: List of file data to index
working_dir: Optional working directory for LLM calls
Returns:
Number of files successfully indexed
"""
if not files:
return 0
# Step 1: Generate LLM summaries
logger.info("Generating LLM summaries for %d files...", len(files))
metadata_map = self.enhancer.enhance_files(files, working_dir)
if not metadata_map:
logger.warning("No LLM metadata generated, falling back to raw code")
return self._index_raw_code(files)
# Step 2: Create semantic chunks from LLM summaries
chunks_to_embed: List[SemanticChunk] = []
file_paths: List[str] = []
for file_data in files:
metadata = metadata_map.get(file_data.path)
if metadata:
# Use LLM-generated summary + keywords for embedding
embeddable_text = self._create_embeddable_text(metadata, file_data)
chunk = SemanticChunk(
content=embeddable_text,
embedding=None,
metadata={
"file": file_data.path,
"language": file_data.language,
"summary": metadata.summary,
"keywords": metadata.keywords,
"purpose": metadata.purpose,
"llm_tool": metadata.llm_tool,
"strategy": "llm_enhanced",
},
)
else:
# Fallback: use truncated raw code
chunk = SemanticChunk(
content=file_data.content[:2000],
embedding=None,
metadata={
"file": file_data.path,
"language": file_data.language,
"strategy": "raw_code",
},
)
chunks_to_embed.append(chunk)
file_paths.append(file_data.path)
# Step 3: Generate embeddings
logger.info("Generating embeddings for %d chunks...", len(chunks_to_embed))
texts = [chunk.content for chunk in chunks_to_embed]
embeddings = self.embedder.embed(texts)
# Step 4: Store in vector store
indexed_count = 0
for chunk, embedding, file_path in zip(chunks_to_embed, embeddings, file_paths):
chunk.embedding = embedding
try:
self.vector_store.add_chunk(chunk, file_path)
indexed_count += 1
except Exception as e:
logger.debug("Failed to store chunk for %s: %s", file_path, e)
logger.info("Successfully indexed %d/%d files", indexed_count, len(files))
return indexed_count
def _create_embeddable_text(
self,
metadata: SemanticMetadata,
file_data: FileData,
) -> str:
"""Create text optimized for embedding from LLM metadata.
Combines summary, keywords, and purpose into a single string
that will produce good semantic matches for natural language queries.
"""
parts = []
# Summary is the primary content
if metadata.summary:
parts.append(metadata.summary)
# Purpose adds categorical context
if metadata.purpose:
parts.append(f"Category: {metadata.purpose}")
# Keywords expand search coverage
if metadata.keywords:
parts.append(f"Keywords: {', '.join(metadata.keywords)}")
# Add file name for context
parts.append(f"File: {Path(file_data.path).name}")
return "\n".join(parts)
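# Example output (illustrative) for a hypothetical auth helper:
#   Parses and validates JWT access tokens
#   Category: auth
#   Keywords: jwt, token, auth, signature, validate
#   File: tokens.py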
def _index_raw_code(self, files: List[FileData]) -> int:
"""Fallback: index raw code without LLM enhancement."""
indexed_count = 0
for file_data in files:
# Truncate to reasonable size
content = file_data.content[:2000]
chunk = SemanticChunk(
content=content,
embedding=None,
metadata={
"file": file_data.path,
"language": file_data.language,
"strategy": "raw_code",
},
)
try:
embedding = self.embedder.embed_single(content)
chunk.embedding = embedding
self.vector_store.add_chunk(chunk, file_data.path)
indexed_count += 1
except Exception as e:
logger.debug("Failed to index %s: %s", file_data.path, e)
return indexed_count
def create_enhanced_indexer(
vector_store_path: Path,
llm_tool: str = "gemini",
llm_enabled: bool = True,
) -> EnhancedSemanticIndexer:
"""Factory function to create an enhanced semantic indexer.
Args:
vector_store_path: Path for the vector store database
llm_tool: LLM tool to use (gemini, qwen)
llm_enabled: Whether to enable LLM enhancement
Returns:
Configured EnhancedSemanticIndexer instance
"""
from .embedder import Embedder
from .vector_store import VectorStore
enhancer = create_enhancer(tool=llm_tool, enabled=llm_enabled)
embedder = Embedder()
vector_store = VectorStore(vector_store_path)
return EnhancedSemanticIndexer(enhancer, embedder, vector_store)
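
Putting the pieces together, a hedged end-to-end sketch built on the factory above; the paths and file contents are illustrative:

from pathlib import Path

indexer = create_enhanced_indexer(Path("indexes/vectors.db"), llm_tool="gemini")
files = [
    FileData(
        path="src/auth.py",
        content=Path("src/auth.py").read_text(encoding="utf-8"),
        language="python",
    ),
]
indexed = indexer.index_files(files, working_dir=Path("."))
print(f"Indexed {indexed} of {len(files)} files")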


@@ -347,6 +347,222 @@ class DirIndexStore:
row = conn.execute("SELECT COUNT(*) AS c FROM files").fetchone()
return int(row["c"]) if row else 0
# === Semantic Metadata ===
def add_semantic_metadata(
self,
file_id: int,
summary: str,
keywords: List[str],
purpose: str,
llm_tool: str
) -> None:
"""Add or update semantic metadata for a file.
Args:
file_id: File ID from files table
summary: LLM-generated summary
keywords: List of keywords
purpose: Purpose/role of the file
llm_tool: Tool used to generate metadata (gemini/qwen)
"""
with self._lock:
conn = self._get_connection()
import json
import time
keywords_json = json.dumps(keywords)
generated_at = time.time()
conn.execute(
"""
INSERT INTO semantic_metadata(file_id, summary, keywords, purpose, llm_tool, generated_at)
VALUES(?, ?, ?, ?, ?, ?)
ON CONFLICT(file_id) DO UPDATE SET
summary=excluded.summary,
keywords=excluded.keywords,
purpose=excluded.purpose,
llm_tool=excluded.llm_tool,
generated_at=excluded.generated_at
""",
(file_id, summary, keywords_json, purpose, llm_tool, generated_at),
)
conn.commit()
def get_semantic_metadata(self, file_id: int) -> Optional[Dict[str, Any]]:
"""Get semantic metadata for a file.
Args:
file_id: File ID from files table
Returns:
Dict with summary, keywords, purpose, llm_tool, generated_at, or None if not found
"""
with self._lock:
conn = self._get_connection()
row = conn.execute(
"""
SELECT summary, keywords, purpose, llm_tool, generated_at
FROM semantic_metadata WHERE file_id=?
""",
(file_id,),
).fetchone()
if not row:
return None
import json
return {
"summary": row["summary"],
"keywords": json.loads(row["keywords"]) if row["keywords"] else [],
"purpose": row["purpose"],
"llm_tool": row["llm_tool"],
"generated_at": float(row["generated_at"]) if row["generated_at"] else 0.0,
}
def get_files_without_semantic(self) -> List[FileEntry]:
"""Get all files that don't have semantic metadata.
Returns:
List of FileEntry objects without semantic metadata
"""
with self._lock:
conn = self._get_connection()
rows = conn.execute(
"""
SELECT f.id, f.name, f.full_path, f.language, f.mtime, f.line_count
FROM files f
LEFT JOIN semantic_metadata sm ON f.id = sm.file_id
WHERE sm.id IS NULL
ORDER BY f.name
"""
).fetchall()
return [
FileEntry(
id=int(row["id"]),
name=row["name"],
full_path=Path(row["full_path"]),
language=row["language"],
mtime=float(row["mtime"]) if row["mtime"] else 0.0,
line_count=int(row["line_count"]) if row["line_count"] else 0,
)
for row in rows
]
def search_semantic_keywords(self, keyword: str) -> List[Tuple[FileEntry, List[str]]]:
"""Search files by semantic keywords.
Args:
keyword: Keyword to search for (case-insensitive)
Returns:
List of (FileEntry, keywords) tuples where keyword matches
"""
with self._lock:
conn = self._get_connection()
keyword_pattern = f"%{keyword}%"
rows = conn.execute(
"""
SELECT f.id, f.name, f.full_path, f.language, f.mtime, f.line_count, sm.keywords
FROM files f
JOIN semantic_metadata sm ON f.id = sm.file_id
WHERE sm.keywords LIKE ? COLLATE NOCASE
ORDER BY f.name
""",
(keyword_pattern,),
).fetchall()
import json
results = []
for row in rows:
file_entry = FileEntry(
id=int(row["id"]),
name=row["name"],
full_path=Path(row["full_path"]),
language=row["language"],
mtime=float(row["mtime"]) if row["mtime"] else 0.0,
line_count=int(row["line_count"]) if row["line_count"] else 0,
)
keywords = json.loads(row["keywords"]) if row["keywords"] else []
results.append((file_entry, keywords))
return results
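# Note: keywords are stored as a JSON-encoded list (see add_semantic_metadata),
# so the LIKE filter above performs a substring match over that JSON text;
# searching "auth" also matches entries such as "authentication" or "author".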
def list_semantic_metadata(
self,
offset: int = 0,
limit: int = 50,
llm_tool: Optional[str] = None,
) -> Tuple[List[Dict[str, Any]], int]:
"""List all semantic metadata with file information.
Args:
offset: Number of records to skip (for pagination)
limit: Maximum records to return (max 100)
llm_tool: Optional filter by LLM tool used
Returns:
Tuple of (list of metadata dicts, total count)
"""
import json
with self._lock:
conn = self._get_connection()
base_query = """
SELECT f.id as file_id, f.name as file_name, f.full_path,
f.language, f.line_count,
sm.summary, sm.keywords, sm.purpose,
sm.llm_tool, sm.generated_at
FROM files f
JOIN semantic_metadata sm ON f.id = sm.file_id
"""
count_query = """
SELECT COUNT(*) as total
FROM files f
JOIN semantic_metadata sm ON f.id = sm.file_id
"""
params: List[Any] = []
if llm_tool:
base_query += " WHERE sm.llm_tool = ?"
count_query += " WHERE sm.llm_tool = ?"
params.append(llm_tool)
base_query += " ORDER BY sm.generated_at DESC LIMIT ? OFFSET ?"
params.extend([min(limit, 100), offset])
count_params = [llm_tool] if llm_tool else []
total_row = conn.execute(count_query, count_params).fetchone()
total = int(total_row["total"]) if total_row else 0
rows = conn.execute(base_query, params).fetchall()
results = []
for row in rows:
results.append({
"file_id": int(row["file_id"]),
"file_name": row["file_name"],
"full_path": row["full_path"],
"language": row["language"],
"line_count": int(row["line_count"]) if row["line_count"] else 0,
"summary": row["summary"],
"keywords": json.loads(row["keywords"]) if row["keywords"] else [],
"purpose": row["purpose"],
"llm_tool": row["llm_tool"],
"generated_at": float(row["generated_at"]) if row["generated_at"] else 0.0,
})
return results, total
# === Subdirectory Links ===
def register_subdir(
@@ -748,12 +964,28 @@ class DirIndexStore:
"""
)
# Semantic metadata table
conn.execute(
"""
CREATE TABLE IF NOT EXISTS semantic_metadata (
id INTEGER PRIMARY KEY,
file_id INTEGER UNIQUE REFERENCES files(id) ON DELETE CASCADE,
summary TEXT,
keywords TEXT,
purpose TEXT,
llm_tool TEXT,
generated_at REAL
)
"""
)
# Indexes
conn.execute("CREATE INDEX IF NOT EXISTS idx_files_name ON files(name)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_files_path ON files(full_path)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_subdirs_name ON subdirs(name)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_symbols_name ON symbols(name)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_symbols_file ON symbols(file_id)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_semantic_file ON semantic_metadata(file_id)")
except sqlite3.DatabaseError as exc:
raise StorageError(f"Failed to create schema: {exc}") from exc
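
Finally, a short usage sketch of the new metadata API, with illustrative values; the DirIndexStore lifecycle follows the pattern used by the semantic-list command above:

store = DirIndexStore(db_path)
store.initialize()
store.add_semantic_metadata(
    file_id=42,
    summary="Parses and validates JWT access tokens",
    keywords=["jwt", "auth", "token"],
    purpose="auth",
    llm_tool="gemini",
)
entries, total = store.list_semantic_metadata(limit=10, llm_tool="gemini")
store.close()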