mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-02-11 02:33:51 +08:00
Add comprehensive tests for vector/semantic search functionality
- Implement full coverage tests for Embedder model loading and embedding generation
- Add CRUD operations and caching tests for VectorStore
- Include cosine similarity computation tests
- Validate semantic search accuracy and relevance through various queries
- Establish performance benchmarks for embedding and search operations
- Ensure edge cases and error handling are covered
- Test thread safety and concurrent access scenarios
- Verify availability of semantic search dependencies
This commit is contained in:
@@ -1098,3 +1098,132 @@ def clean(
|
||||
else:
|
||||
console.print(f"[red]Clean failed (unexpected):[/red] {exc}")
|
||||
raise typer.Exit(code=1)
|
||||
|
||||
|
||||
@app.command("semantic-list")
def semantic_list(
    path: Path = typer.Option(Path("."), "--path", "-p", help="Project path to list metadata from."),
    offset: int = typer.Option(0, "--offset", "-o", min=0, help="Number of records to skip."),
    limit: int = typer.Option(50, "--limit", "-n", min=1, max=100, help="Maximum records to return."),
    tool_filter: Optional[str] = typer.Option(None, "--tool", "-t", help="Filter by LLM tool (gemini/qwen)."),
    json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."),
) -> None:
    """List semantic metadata entries for indexed files.

    Shows files that have LLM-generated summaries and keywords.
    Results are aggregated from all index databases in the project,
    sorted newest first by ``generated_at`` and then paginated.

    Args:
        path: Project root whose index should be inspected.
        offset: Number of aggregated entries to skip.
        limit: Maximum number of aggregated entries to show.
        tool_filter: If given, only entries generated by this LLM tool.
        json_mode: Emit a JSON envelope instead of a table.
        verbose: Enable debug logging and per-database read warnings.

    Raises:
        typer.Exit: With code 1 when the project is not indexed or any
            storage/unexpected error occurs.
    """
    _configure_logging(verbose)
    base_path = path.expanduser().resolve()

    registry: Optional[RegistryStore] = None
    try:
        registry = RegistryStore()
        registry.initialize()
        mapper = PathMapper()

        project_info = registry.find_project(base_path)
        if not project_info:
            raise CodexLensError(f"No index found for: {base_path}. Run 'codex-lens init' first.")

        index_dir = mapper.source_to_index_dir(base_path)
        if not index_dir.exists():
            raise CodexLensError(f"Index directory not found: {index_dir}")

        all_results: list = []
        total_count = 0

        for db_path in sorted(index_dir.rglob("_index.db")):
            # Best effort per database: one unreadable index must not abort
            # the whole listing, so failures are only reported in verbose mode.
            try:
                store = DirIndexStore(db_path)
                try:
                    store.initialize()

                    # NOTE(review): at most 1000 rows are fetched per database,
                    # while `count` (presumably the database-wide total) still
                    # feeds `total_count`; very large indexes are truncated.
                    results, count = store.list_semantic_metadata(
                        offset=0,
                        limit=1000,
                        llm_tool=tool_filter,
                    )

                    source_dir = mapper.index_to_source(db_path.parent)
                    for r in results:
                        r["source_dir"] = str(source_dir)

                    all_results.extend(results)
                    total_count += count
                finally:
                    # Always release the connection, including when
                    # initialize() or the query raised.
                    store.close()
            except Exception as e:
                if verbose:
                    console.print(f"[yellow]Warning: Error reading {db_path}: {e}[/yellow]")

        # A missing timestamp sorts as oldest instead of raising TypeError
        # when compared with real floats.
        all_results.sort(key=lambda x: x["generated_at"] or 0.0, reverse=True)
        paginated = all_results[offset : offset + limit]

        result = {
            "path": str(base_path),
            "total": total_count,
            "offset": offset,
            "limit": limit,
            "count": len(paginated),
            "entries": paginated,
        }

        if json_mode:
            print_json(success=True, result=result)
        else:
            if not paginated:
                console.print("[yellow]No semantic metadata found.[/yellow]")
                console.print("Run 'codex-lens enhance' to generate metadata for indexed files.")
            else:
                table = Table(title=f"Semantic Metadata ({total_count} total)")
                table.add_column("File", style="cyan", max_width=40)
                table.add_column("Language", style="dim")
                table.add_column("Purpose", max_width=30)
                table.add_column("Keywords", max_width=25)
                table.add_column("Tool")

                for entry in paginated:
                    # Show the first three keywords and summarise the rest.
                    keywords_str = ", ".join(entry["keywords"][:3])
                    if len(entry["keywords"]) > 3:
                        keywords_str += f" (+{len(entry['keywords']) - 3})"

                    table.add_row(
                        entry["file_name"],
                        entry["language"] or "-",
                        (entry["purpose"] or "-")[:30],
                        keywords_str or "-",
                        entry["llm_tool"] or "-",
                    )

                console.print(table)

                if total_count > len(paginated):
                    console.print(
                        f"[dim]Showing {offset + 1}-{offset + len(paginated)} of {total_count}. "
                        "Use --offset and --limit for pagination.[/dim]"
                    )

    except StorageError as exc:
        if json_mode:
            print_json(success=False, error=f"Storage error: {exc}")
        else:
            console.print(f"[red]Semantic-list failed (storage):[/red] {exc}")
        raise typer.Exit(code=1)
    except CodexLensError as exc:
        if json_mode:
            print_json(success=False, error=str(exc))
        else:
            console.print(f"[red]Semantic-list failed:[/red] {exc}")
        raise typer.Exit(code=1)
    except Exception as exc:
        if json_mode:
            print_json(success=False, error=f"Unexpected error: {exc}")
        else:
            console.print(f"[red]Semantic-list failed (unexpected):[/red] {exc}")
        raise typer.Exit(code=1)
    finally:
        if registry is not None:
            registry.close()
|
||||
|
||||
@@ -78,6 +78,11 @@ class Config:
|
||||
}
|
||||
)
|
||||
|
||||
llm_enabled: bool = False
|
||||
llm_tool: str = "gemini"
|
||||
llm_timeout_ms: int = 300000
|
||||
llm_batch_size: int = 5
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
try:
|
||||
self.data_dir = self.data_dir.expanduser().resolve()
|
||||
|
||||
@@ -30,6 +30,7 @@ class SearchOptions:
|
||||
total_limit: Total result limit across all directories
|
||||
include_symbols: Whether to include symbol search results
|
||||
files_only: Return only file paths without excerpts
|
||||
include_semantic: Whether to include semantic keyword search results
|
||||
"""
|
||||
depth: int = -1
|
||||
max_workers: int = 8
|
||||
@@ -37,6 +38,7 @@ class SearchOptions:
|
||||
total_limit: int = 100
|
||||
include_symbols: bool = False
|
||||
files_only: bool = False
|
||||
include_semantic: bool = False
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -378,7 +380,8 @@ class ChainSearchEngine:
|
||||
idx_path,
|
||||
query,
|
||||
options.limit_per_dir,
|
||||
options.files_only
|
||||
options.files_only,
|
||||
options.include_semantic
|
||||
): idx_path
|
||||
for idx_path in index_paths
|
||||
}
|
||||
@@ -400,7 +403,8 @@ class ChainSearchEngine:
|
||||
def _search_single_index(self, index_path: Path,
|
||||
query: str,
|
||||
limit: int,
|
||||
files_only: bool = False) -> List[SearchResult]:
|
||||
files_only: bool = False,
|
||||
include_semantic: bool = False) -> List[SearchResult]:
|
||||
"""Search a single index database.
|
||||
|
||||
Handles exceptions gracefully, returning empty list on failure.
|
||||
@@ -410,18 +414,40 @@ class ChainSearchEngine:
|
||||
query: FTS5 query string
|
||||
limit: Maximum results from this index
|
||||
files_only: If True, skip snippet generation for faster search
|
||||
include_semantic: If True, also search semantic keywords and merge results
|
||||
|
||||
Returns:
|
||||
List of SearchResult objects (empty on error)
|
||||
"""
|
||||
try:
|
||||
with DirIndexStore(index_path) as store:
|
||||
# Get FTS results
|
||||
if files_only:
|
||||
# Fast path: return paths only without snippets
|
||||
paths = store.search_files_only(query, limit=limit)
|
||||
return [SearchResult(path=p, score=0.0, excerpt="") for p in paths]
|
||||
fts_results = [SearchResult(path=p, score=0.0, excerpt="") for p in paths]
|
||||
else:
|
||||
return store.search_fts(query, limit=limit)
|
||||
fts_results = store.search_fts(query, limit=limit)
|
||||
|
||||
# Optionally add semantic keyword results
|
||||
if include_semantic:
|
||||
try:
|
||||
semantic_matches = store.search_semantic_keywords(query)
|
||||
# Convert semantic matches to SearchResult with 0.8x weight
|
||||
for file_entry, keywords in semantic_matches:
|
||||
# Create excerpt from keywords
|
||||
excerpt = f"Keywords: {', '.join(keywords[:5])}"
|
||||
# Use a base score of 10.0 for semantic matches, weighted by 0.8
|
||||
semantic_result = SearchResult(
|
||||
path=str(file_entry.full_path),
|
||||
score=10.0 * 0.8,
|
||||
excerpt=excerpt
|
||||
)
|
||||
fts_results.append(semantic_result)
|
||||
except Exception as sem_exc:
|
||||
self.logger.debug(f"Semantic search error in {index_path}: {sem_exc}")
|
||||
|
||||
return fts_results
|
||||
except Exception as exc:
|
||||
self.logger.debug(f"Search error in {index_path}: {exc}")
|
||||
return []
|
||||
|
||||
@@ -32,4 +32,38 @@ def check_semantic_available() -> tuple[bool, str | None]:
|
||||
"""Check if semantic search dependencies are available."""
|
||||
return SEMANTIC_AVAILABLE, _import_error
|
||||
|
||||
__all__ = ["SEMANTIC_AVAILABLE", "SEMANTIC_BACKEND", "check_semantic_available"]
|
||||
# Export LLM enhancement classes
|
||||
try:
|
||||
from .llm_enhancer import (
|
||||
LLMEnhancer,
|
||||
LLMConfig,
|
||||
SemanticMetadata,
|
||||
FileData,
|
||||
EnhancedSemanticIndexer,
|
||||
create_enhancer,
|
||||
create_enhanced_indexer,
|
||||
)
|
||||
LLM_AVAILABLE = True
|
||||
except ImportError:
|
||||
LLM_AVAILABLE = False
|
||||
LLMEnhancer = None # type: ignore
|
||||
LLMConfig = None # type: ignore
|
||||
SemanticMetadata = None # type: ignore
|
||||
FileData = None # type: ignore
|
||||
EnhancedSemanticIndexer = None # type: ignore
|
||||
create_enhancer = None # type: ignore
|
||||
create_enhanced_indexer = None # type: ignore
|
||||
|
||||
__all__ = [
|
||||
"SEMANTIC_AVAILABLE",
|
||||
"SEMANTIC_BACKEND",
|
||||
"check_semantic_available",
|
||||
"LLM_AVAILABLE",
|
||||
"LLMEnhancer",
|
||||
"LLMConfig",
|
||||
"SemanticMetadata",
|
||||
"FileData",
|
||||
"EnhancedSemanticIndexer",
|
||||
"create_enhancer",
|
||||
"create_enhanced_indexer",
|
||||
]
|
||||
|
||||
667
codex-lens/src/codexlens/semantic/llm_enhancer.py
Normal file
667
codex-lens/src/codexlens/semantic/llm_enhancer.py
Normal file
@@ -0,0 +1,667 @@
|
||||
"""LLM-based semantic enhancement using CCW CLI.
|
||||
|
||||
This module provides LLM-generated descriptions that are then embedded
|
||||
by fastembed for improved semantic search. The flow is:
|
||||
|
||||
Code → LLM Summary → fastembed embedding → VectorStore → semantic search
|
||||
|
||||
LLM-generated summaries match natural language queries better than raw code.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
import shutil
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, TYPE_CHECKING
|
||||
|
||||
from codexlens.entities import SemanticChunk, Symbol
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .embedder import Embedder
|
||||
from .vector_store import VectorStore
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class SemanticMetadata:
    """Semantic description of a file or symbol as produced by an LLM.

    The first three fields are required; the remaining ones are optional
    provenance and default to ``None``.
    """

    # Short natural-language description of the code.
    summary: str
    # Search keywords extracted for the code.
    keywords: List[str]
    # Functional category of the code (free-form text).
    purpose: str
    # Path of the described file, when known.
    file_path: Optional[str] = None
    # Name of the described symbol, when the metadata is symbol-level.
    symbol_name: Optional[str] = None
    # Name of the LLM tool that generated this metadata.
    llm_tool: Optional[str] = None
|
||||
|
||||
|
||||
@dataclass
class FileData:
    """Input record describing one source file handed to the LLM."""

    # Path used to identify the file in prompts and results.
    path: str
    # Full text of the file.
    content: str
    # Language name; used as the code-fence hint in prompts.
    language: str
    # Symbols found in the file; each instance gets its own empty list.
    symbols: List[Symbol] = field(default_factory=list)
|
||||
|
||||
|
||||
@dataclass
class LLMConfig:
    """Settings controlling LLM enhancement.

    The tool names default to values read from the environment each time
    an instance is created:

    - CCW_CLI_SECONDARY_TOOL: Primary tool for LLM calls (default: gemini)
    - CCW_CLI_FALLBACK_TOOL: Fallback tool if primary fails (default: qwen)
    """

    # Tool tried first for every LLM call.
    tool: str = field(
        default_factory=lambda: os.environ.get("CCW_CLI_SECONDARY_TOOL", "gemini")
    )
    # Tool tried when the primary tool fails.
    fallback_tool: str = field(
        default_factory=lambda: os.environ.get("CCW_CLI_FALLBACK_TOOL", "qwen")
    )
    # Timeout forwarded to the CLI, in milliseconds.
    timeout_ms: int = 300000
    # Number of files sent per LLM call.
    batch_size: int = 5
    # Max chars per file in batch prompt.
    max_content_chars: int = 8000
    # Master switch for LLM enhancement.
    enabled: bool = True
|
||||
|
||||
|
||||
class LLMEnhancer:
    """LLM-based semantic enhancement using CCW CLI.

    Generates code summaries and search keywords by calling
    external LLM tools (gemini, qwen) via CCW CLI subprocess.
    """

    PROMPT_TEMPLATE = '''PURPOSE: Generate semantic summaries and search keywords for code files
TASK:
- For each code block, generate a concise summary (1-2 sentences)
- Extract 5-10 relevant search keywords
- Identify the functional purpose/category
MODE: analysis
EXPECTED: JSON format output

=== CODE BLOCKS ===
{code_blocks}

=== OUTPUT FORMAT ===
Return ONLY valid JSON (no markdown, no explanation):
{{
  "files": {{
    "<file_path>": {{
      "summary": "Brief description of what this code does",
      "keywords": ["keyword1", "keyword2", ...],
      "purpose": "category like: auth, api, util, ui, data, config, test"
    }}
  }}
}}'''

    def __init__(self, config: LLMConfig | None = None) -> None:
        """Initialize LLM enhancer.

        Args:
            config: LLM configuration, uses defaults if None
        """
        self.config = config or LLMConfig()
        # None means "not probed yet"; afterwards it caches the PATH lookup.
        self._ccw_available: Optional[bool] = None

    def check_available(self) -> bool:
        """Check if CCW CLI tool is available.

        The PATH lookup is performed once and cached on the instance.

        Returns:
            True if a ``ccw`` executable was found on PATH.
        """
        if self._ccw_available is not None:
            return self._ccw_available

        self._ccw_available = shutil.which("ccw") is not None
        if not self._ccw_available:
            logger.warning("CCW CLI not found in PATH, LLM enhancement disabled")
        return self._ccw_available

    def enhance_files(
        self,
        files: List[FileData],
        working_dir: Optional[Path] = None,
    ) -> Dict[str, SemanticMetadata]:
        """Enhance multiple files with LLM-generated semantic metadata.

        Processes files in batches to manage token limits and API costs.
        A failing batch is logged and skipped; the remaining batches still run.

        Args:
            files: List of file data to process
            working_dir: Optional working directory for CCW CLI

        Returns:
            Dict mapping file paths to SemanticMetadata. Empty when
            enhancement is disabled, the CLI is unavailable, or ``files``
            is empty.
        """
        if not self.config.enabled:
            logger.debug("LLM enhancement disabled by config")
            return {}

        if not self.check_available():
            return {}

        if not files:
            return {}

        results: Dict[str, SemanticMetadata] = {}
        # range() rejects a zero step, so a misconfigured batch size is
        # clamped to one file per call instead of raising ValueError.
        batch_size = max(1, self.config.batch_size)
        total_batches = (len(files) + batch_size - 1) // batch_size

        for batch_no, start in enumerate(range(0, len(files), batch_size), 1):
            batch = files[start:start + batch_size]
            try:
                batch_results = self._process_batch(batch, working_dir)
                results.update(batch_results)
                logger.debug(
                    "Processed batch %d/%d: %d files enhanced",
                    batch_no,
                    total_batches,
                    len(batch_results),
                )
            except Exception as e:
                logger.warning(
                    "Batch %d failed, continuing: %s",
                    batch_no,
                    e,
                )
                continue

        return results

    def enhance_file(
        self,
        path: str,
        content: str,
        language: str,
        working_dir: Optional[Path] = None,
    ) -> SemanticMetadata:
        """Enhance a single file with LLM-generated semantic metadata.

        Convenience method that wraps enhance_files for single file processing.

        Args:
            path: File path
            content: File content
            language: Programming language
            working_dir: Optional working directory for CCW CLI

        Returns:
            SemanticMetadata for the file. When no metadata keyed by ``path``
            comes back (enhancement disabled, CLI failure, or the response
            used a different key), a generic placeholder is returned instead
            of raising.
        """
        file_data = FileData(path=path, content=content, language=language)
        results = self.enhance_files([file_data], working_dir)

        if path not in results:
            # Return default metadata if enhancement failed
            return SemanticMetadata(
                summary=f"Code file written in {language}",
                keywords=[language, "code"],
                purpose="unknown",
                file_path=path,
                llm_tool=self.config.tool,
            )

        return results[path]

    def _process_batch(
        self,
        files: List[FileData],
        working_dir: Optional[Path] = None,
    ) -> Dict[str, SemanticMetadata]:
        """Process a single batch of files.

        Tries the primary tool, then the fallback tool if one is configured.

        Args:
            files: Files to describe in one prompt
            working_dir: Optional working directory for CCW CLI

        Returns:
            Dict mapping file paths to SemanticMetadata; empty if both
            tools failed.
        """
        prompt = self._build_batch_prompt(files)

        # Track which tool actually answered so the metadata is attributed
        # to it rather than always to the primary tool.
        used_tool = self.config.tool
        result = self._invoke_ccw_cli(
            prompt,
            tool=used_tool,
            working_dir=working_dir,
        )

        # Fallback to secondary tool if primary fails
        if not result["success"] and self.config.fallback_tool:
            logger.debug(
                "Primary tool %s failed, trying fallback %s",
                self.config.tool,
                self.config.fallback_tool,
            )
            used_tool = self.config.fallback_tool
            result = self._invoke_ccw_cli(
                prompt,
                tool=used_tool,
                working_dir=working_dir,
            )

        if not result["success"]:
            logger.warning("LLM call failed: %s", result.get("stderr", "unknown error"))
            return {}

        return self._parse_response(result["stdout"], used_tool)

    def _build_batch_prompt(self, files: List[FileData]) -> str:
        """Build prompt for batch processing.

        Each file becomes a ``[FILE: path]`` header followed by a fenced code
        block; content longer than ``config.max_content_chars`` is cut and
        marked as truncated.
        """
        limit = self.config.max_content_chars
        code_blocks_parts: List[str] = []

        for file_data in files:
            content = file_data.content
            if len(content) > limit:
                content = content[:limit] + "\n... [truncated]"

            lang_hint = file_data.language or "text"
            code_blocks_parts.append(
                f"[FILE: {file_data.path}]\n```{lang_hint}\n{content}\n```"
            )

        return self.PROMPT_TEMPLATE.format(code_blocks="\n\n".join(code_blocks_parts))

    def _invoke_ccw_cli(
        self,
        prompt: str,
        tool: str = "gemini",
        working_dir: Optional[Path] = None,
    ) -> Dict[str, Any]:
        """Invoke CCW CLI tool via subprocess.

        Args:
            prompt: The prompt to send to LLM
            tool: Tool name (gemini, qwen, codex)
            working_dir: Optional working directory

        Returns:
            Dict with success, stdout, stderr, exit_code. Failures to launch
            or a timeout are reported as ``success=False`` with
            ``exit_code=-1``; nothing is raised.
        """
        import sys

        # Give the CLI a grace period beyond its own --timeout.
        timeout_seconds = (self.config.timeout_ms / 1000) + 30

        # NOTE(review): the whole prompt travels as one argv entry; large
        # batches may exceed OS command-line length limits - confirm
        # whether the CLI can read the prompt from stdin instead.
        base_args = [
            "cli", "exec",
            prompt,  # Direct string argument
            "--tool", tool,
            "--mode", "analysis",
            "--timeout", str(self.config.timeout_ms),
        ]
        if working_dir:
            base_args.extend(["--cd", str(working_dir)])

        try:
            cmd: Any = ["ccw"] + base_args
            use_shell = False

            if sys.platform == "win32":
                # On Windows, ccw is a .CMD wrapper that requires shell.
                # Instead, directly invoke node with the ccw.js script.
                ccw_path = shutil.which("ccw")
                if ccw_path and ccw_path.lower().endswith(".cmd"):
                    ccw_js = Path(ccw_path).parent / "node_modules" / "ccw" / "bin" / "ccw.js"
                    if ccw_js.exists():
                        cmd = ["node", str(ccw_js)] + base_args
                    else:
                        # Fallback to shell execution. The prompt contains
                        # source code, so quote every argument with the
                        # stdlib routine rather than wrapping in bare '"'.
                        # NOTE(review): cmd.exe metacharacters (%, ^, &) in
                        # the prompt may still be interpreted by the shell.
                        cmd = subprocess.list2cmdline(["ccw"] + base_args)
                        use_shell = True

            result = subprocess.run(
                cmd,
                shell=use_shell,
                capture_output=True,
                text=True,
                timeout=timeout_seconds,
                cwd=working_dir,
                encoding="utf-8",
                errors="replace",
            )

            return {
                "success": result.returncode == 0,
                "stdout": result.stdout,
                "stderr": result.stderr,
                "exit_code": result.returncode,
            }

        except subprocess.TimeoutExpired:
            # Report the limit that was actually enforced.
            logger.warning("CCW CLI timeout after %ds", timeout_seconds)
            return {
                "success": False,
                "stdout": "",
                "stderr": "timeout",
                "exit_code": -1,
            }
        except FileNotFoundError:
            logger.warning("CCW CLI not found - ensure 'ccw' is in PATH")
            return {
                "success": False,
                "stdout": "",
                "stderr": "ccw command not found",
                "exit_code": -1,
            }
        except Exception as e:
            logger.warning("CCW CLI invocation failed: %s", e)
            return {
                "success": False,
                "stdout": "",
                "stderr": str(e),
                "exit_code": -1,
            }

    @staticmethod
    def _normalize_keywords(raw: Any) -> List[str]:
        """Coerce the model's ``keywords`` value into a list of strings."""
        if isinstance(raw, str):
            # Some models answer with "a, b, c" instead of a JSON array.
            return [part.strip() for part in raw.split(",") if part.strip()]
        if isinstance(raw, (list, tuple)):
            return [str(item) for item in raw if item is not None]
        return []

    def _parse_response(
        self,
        stdout: str,
        tool: str,
    ) -> Dict[str, SemanticMetadata]:
        """Parse LLM response into SemanticMetadata objects.

        Args:
            stdout: Raw stdout from CCW CLI
            tool: Tool name used for generation

        Returns:
            Dict mapping file paths to SemanticMetadata; empty when no
            parseable JSON object is present.
        """
        results: Dict[str, SemanticMetadata] = {}

        # Extract JSON from response (may be wrapped in markdown or other text)
        json_str = self._extract_json(stdout)
        if not json_str:
            logger.warning("No JSON found in LLM response")
            return results

        try:
            data = json.loads(json_str)
        except json.JSONDecodeError as e:
            logger.warning("Failed to parse LLM response JSON: %s", e)
            return results

        # Handle expected format: {"files": {"path": {...}}}; a bare
        # {"path": {...}} mapping is accepted as well.
        files_data = data.get("files", data)
        if not isinstance(files_data, dict):
            logger.warning("Unexpected response format: expected dict")
            return results

        for file_path, metadata in files_data.items():
            if not isinstance(metadata, dict):
                continue

            # Model output is untrusted: null or mistyped fields are coerced
            # so later string joins and embedding calls see str / List[str].
            results[file_path] = SemanticMetadata(
                summary=str(metadata.get("summary") or ""),
                keywords=self._normalize_keywords(metadata.get("keywords")),
                purpose=str(metadata.get("purpose") or ""),
                file_path=file_path,
                llm_tool=tool,
            )

        return results

    def _extract_json(self, text: str) -> Optional[str]:
        """Extract JSON object from text that may contain markdown or other content.

        Returns the first brace-balanced ``{...}`` span, or None if there is
        no opening brace or the object is never closed. Braces inside JSON
        string values are ignored when balancing.
        """
        text = text.strip()

        # Remove markdown code blocks if present
        if text.startswith("```"):
            # Drop the opening fence line (```json or ```) ...
            lines = text.split("\n")[1:]
            # ... and everything from the closing fence onwards.
            for i, line in enumerate(lines):
                if line.strip() == "```":
                    lines = lines[:i]
                    break
            text = "\n".join(lines)

        start = text.find("{")
        if start == -1:
            return None

        depth = 0
        in_string = False
        escaped = False
        for i, char in enumerate(text[start:], start):
            if in_string:
                # Inside a string only an unescaped quote is significant.
                if escaped:
                    escaped = False
                elif char == "\\":
                    escaped = True
                elif char == '"':
                    in_string = False
                continue

            if char == '"':
                in_string = True
            elif char == "{":
                depth += 1
            elif char == "}":
                depth -= 1
                if depth == 0:
                    return text[start:i + 1]

        # Ran out of text before the object closed.
        return None
|
||||
|
||||
|
||||
def create_enhancer(
    tool: str = "gemini",
    timeout_ms: int = 300000,
    batch_size: int = 5,
    enabled: bool = True,
) -> LLMEnhancer:
    """Build an LLMEnhancer from individual settings.

    Args:
        tool: Primary LLM tool name passed to LLMConfig
        timeout_ms: CLI timeout in milliseconds
        batch_size: Number of files per LLM call
        enabled: Whether enhancement is switched on

    Returns:
        LLMEnhancer wrapping an LLMConfig built from the arguments; settings
        not listed here keep their LLMConfig defaults.
    """
    return LLMEnhancer(
        LLMConfig(
            tool=tool,
            timeout_ms=timeout_ms,
            batch_size=batch_size,
            enabled=enabled,
        )
    )
|
||||
|
||||
|
||||
class EnhancedSemanticIndexer:
    """Combine LLM-generated descriptions with embedding-based search.

    Pipeline:
    1. The enhancer produces a summary, keywords and purpose per file.
    2. The embedder turns that text into vectors.
    3. The vector store persists the resulting chunks.

    Files for which no metadata came back are embedded from the first
    2000 characters of their raw content instead.
    """

    def __init__(
        self,
        enhancer: LLMEnhancer,
        embedder: "Embedder",
        vector_store: "VectorStore",
    ) -> None:
        """Store the three collaborators used during indexing.

        Args:
            enhancer: Source of LLM summaries
            embedder: Produces embedding vectors from text
            vector_store: Destination for embedded chunks
        """
        self.enhancer = enhancer
        self.embedder = embedder
        self.vector_store = vector_store

    def index_files(
        self,
        files: List[FileData],
        working_dir: Optional[Path] = None,
    ) -> int:
        """Embed and store one chunk per file, preferring LLM summaries.

        Args:
            files: Files to index
            working_dir: Optional working directory forwarded to the enhancer

        Returns:
            Count of chunks the vector store accepted.
        """
        if not files:
            return 0

        # Step 1: ask the LLM for summaries.
        logger.info("Generating LLM summaries for %d files...", len(files))
        metadata_map = self.enhancer.enhance_files(files, working_dir)
        if not metadata_map:
            logger.warning("No LLM metadata generated, falling back to raw code")
            return self._index_raw_code(files)

        # Step 2: one chunk per file, enhanced where metadata exists.
        chunks_to_embed: List[SemanticChunk] = []
        file_paths: List[str] = []
        for item in files:
            info = metadata_map.get(item.path)
            if not info:
                # No metadata for this file: embed truncated raw code.
                piece = SemanticChunk(
                    content=item.content[:2000],
                    embedding=None,
                    metadata={
                        "file": item.path,
                        "language": item.language,
                        "strategy": "raw_code",
                    },
                )
            else:
                piece = SemanticChunk(
                    content=self._create_embeddable_text(info, item),
                    embedding=None,
                    metadata={
                        "file": item.path,
                        "language": item.language,
                        "summary": info.summary,
                        "keywords": info.keywords,
                        "purpose": info.purpose,
                        "llm_tool": info.llm_tool,
                        "strategy": "llm_enhanced",
                    },
                )
            chunks_to_embed.append(piece)
            file_paths.append(item.path)

        # Step 3: embed all chunk texts in one call.
        logger.info("Generating embeddings for %d chunks...", len(chunks_to_embed))
        embeddings = self.embedder.embed([piece.content for piece in chunks_to_embed])

        # Step 4: persist; a chunk that fails to store is logged and skipped.
        stored = 0
        for piece, vector, target in zip(chunks_to_embed, embeddings, file_paths):
            piece.embedding = vector
            try:
                self.vector_store.add_chunk(piece, target)
            except Exception as e:
                logger.debug("Failed to store chunk for %s: %s", target, e)
            else:
                stored += 1

        logger.info("Successfully indexed %d/%d files", stored, len(files))
        return stored

    def _create_embeddable_text(
        self,
        metadata: SemanticMetadata,
        file_data: FileData,
    ) -> str:
        """Join summary, purpose, keywords and file name into embeddable text.

        Empty metadata fields are left out; the file-name line is always
        present. Lines are separated by newlines.
        """
        candidates = [
            metadata.summary if metadata.summary else None,
            f"Category: {metadata.purpose}" if metadata.purpose else None,
            f"Keywords: {', '.join(metadata.keywords)}" if metadata.keywords else None,
            f"File: {Path(file_data.path).name}",
        ]
        return "\n".join(line for line in candidates if line is not None)

    def _index_raw_code(self, files: List[FileData]) -> int:
        """Index truncated raw file content when no LLM metadata is available.

        Each file is embedded and stored independently; a failure for one
        file is logged and does not stop the others.

        Returns:
            Count of files embedded and stored.
        """
        stored = 0
        for item in files:
            # Only the first 2000 characters are embedded.
            snippet = item.content[:2000]
            piece = SemanticChunk(
                content=snippet,
                embedding=None,
                metadata={
                    "file": item.path,
                    "language": item.language,
                    "strategy": "raw_code",
                },
            )
            try:
                piece.embedding = self.embedder.embed_single(snippet)
                self.vector_store.add_chunk(piece, item.path)
            except Exception as e:
                logger.debug("Failed to index %s: %s", item.path, e)
            else:
                stored += 1
        return stored
|
||||
|
||||
|
||||
def create_enhanced_indexer(
    vector_store_path: Path,
    llm_tool: str = "gemini",
    llm_enabled: bool = True,
) -> EnhancedSemanticIndexer:
    """Assemble an EnhancedSemanticIndexer with default collaborators.

    Args:
        vector_store_path: Path handed to the VectorStore constructor
        llm_tool: LLM tool to use (gemini, qwen)
        llm_enabled: Whether to enable LLM enhancement

    Returns:
        Indexer wired to a new enhancer, a default Embedder and a
        VectorStore at ``vector_store_path``.
    """
    # Imported here rather than at module level, so they load only when called.
    from .embedder import Embedder
    from .vector_store import VectorStore

    return EnhancedSemanticIndexer(
        create_enhancer(tool=llm_tool, enabled=llm_enabled),
        Embedder(),
        VectorStore(vector_store_path),
    )
|
||||
@@ -347,6 +347,222 @@ class DirIndexStore:
|
||||
row = conn.execute("SELECT COUNT(*) AS c FROM files").fetchone()
|
||||
return int(row["c"]) if row else 0
|
||||
|
||||
# === Semantic Metadata ===
|
||||
|
||||
def add_semantic_metadata(
    self,
    file_id: int,
    summary: str,
    keywords: List[str],
    purpose: str,
    llm_tool: str
) -> None:
    """Insert semantic metadata for a file, replacing any existing row.

    Keywords are stored as a JSON array string and the row is stamped with
    the current time. The change is committed before returning.

    Args:
        file_id: File ID from files table
        summary: LLM-generated summary
        keywords: List of keywords
        purpose: Purpose/role of the file
        llm_tool: Tool used to generate metadata (gemini/qwen)
    """
    import json
    import time

    with self._lock:
        connection = self._get_connection()
        row_values = (
            file_id,
            summary,
            json.dumps(keywords),
            purpose,
            llm_tool,
            time.time(),
        )
        # Upsert keyed on file_id: a second call for the same file overwrites.
        connection.execute(
            """
            INSERT INTO semantic_metadata(file_id, summary, keywords, purpose, llm_tool, generated_at)
            VALUES(?, ?, ?, ?, ?, ?)
            ON CONFLICT(file_id) DO UPDATE SET
            summary=excluded.summary,
            keywords=excluded.keywords,
            purpose=excluded.purpose,
            llm_tool=excluded.llm_tool,
            generated_at=excluded.generated_at
            """,
            row_values,
        )
        connection.commit()
|
||||
|
||||
def get_semantic_metadata(self, file_id: int) -> Optional[Dict[str, Any]]:
    """Fetch the stored semantic metadata for one file.

    Args:
        file_id: File ID from files table

    Returns:
        A dict with the keys ``summary``, ``keywords``, ``purpose``,
        ``llm_tool`` and ``generated_at``, or None when the file has no
        semantic metadata row. ``keywords`` is decoded from its JSON text
        (empty list when the column is empty) and ``generated_at`` falls
        back to 0.0 when unset.
    """
    import json

    with self._lock:
        cursor = self._get_connection().execute(
            """
            SELECT summary, keywords, purpose, llm_tool, generated_at
            FROM semantic_metadata WHERE file_id=?
            """,
            (file_id,),
        )
        record = cursor.fetchone()

        if not record:
            return None

        raw_keywords = record["keywords"]
        raw_timestamp = record["generated_at"]

        metadata: Dict[str, Any] = {}
        metadata["summary"] = record["summary"]
        metadata["keywords"] = json.loads(raw_keywords) if raw_keywords else []
        metadata["purpose"] = record["purpose"]
        metadata["llm_tool"] = record["llm_tool"]
        metadata["generated_at"] = float(raw_timestamp) if raw_timestamp else 0.0
        return metadata
|
||||
|
||||
def get_files_without_semantic(self) -> List[FileEntry]:
    """Return every indexed file that has no semantic metadata row yet.

    Uses a LEFT JOIN against semantic_metadata and keeps only the files
    for which no matching row exists.

    Returns:
        List of FileEntry objects without semantic metadata, ordered by
        file name. Missing mtime / line_count values become 0.0 / 0.
    """
    with self._lock:
        cursor = self._get_connection().execute(
            """
            SELECT f.id, f.name, f.full_path, f.language, f.mtime, f.line_count
            FROM files f
            LEFT JOIN semantic_metadata sm ON f.id = sm.file_id
            WHERE sm.id IS NULL
            ORDER BY f.name
            """
        )

        pending: List[FileEntry] = []
        for record in cursor.fetchall():
            modified = record["mtime"]
            lines = record["line_count"]
            pending.append(
                FileEntry(
                    id=int(record["id"]),
                    name=record["name"],
                    full_path=Path(record["full_path"]),
                    language=record["language"],
                    mtime=float(modified) if modified else 0.0,
                    line_count=int(lines) if lines else 0,
                )
            )
        return pending
|
||||
|
||||
def search_semantic_keywords(self, keyword: str) -> List[Tuple[FileEntry, List[str]]]:
    """Search files by semantic keywords.

    The search is a case-insensitive substring match against each file's
    stored keywords. SQL ``LIKE`` is used as a coarse pre-filter on the JSON
    text column; the decoded keyword list is then checked in Python so that
    a hit on JSON punctuation (quotes, brackets, separators) is not reported
    as a keyword match.

    Args:
        keyword: Keyword to search for (case-insensitive). ``%`` and ``_``
            are matched literally, not as wildcards.

    Returns:
        List of (FileEntry, keywords) tuples where keyword matches,
        ordered by file name
    """
    import json

    # Escape LIKE metacharacters so user input cannot act as a wildcard.
    # The backslash itself is escaped first to avoid double-processing.
    escaped = (
        keyword.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
    )
    keyword_pattern = f"%{escaped}%"
    needle = keyword.lower()

    with self._lock:
        conn = self._get_connection()

        # SQLite's LIKE is already case-insensitive for ASCII characters.
        rows = conn.execute(
            """
            SELECT f.id, f.name, f.full_path, f.language, f.mtime, f.line_count, sm.keywords
            FROM files f
            JOIN semantic_metadata sm ON f.id = sm.file_id
            WHERE sm.keywords LIKE ? ESCAPE '\\'
            ORDER BY f.name
            """,
            (keyword_pattern,),
        ).fetchall()

        results = []
        for row in rows:
            keywords = json.loads(row["keywords"]) if row["keywords"] else []
            if not isinstance(keywords, list):
                # e.g. a stored JSON ``null``; treat as "no keywords".
                keywords = []

            # Confirm the term occurs inside an actual keyword, not merely
            # somewhere in the serialized JSON text.
            if not any(needle in str(kw).lower() for kw in keywords):
                continue

            file_entry = FileEntry(
                id=int(row["id"]),
                name=row["name"],
                full_path=Path(row["full_path"]),
                language=row["language"],
                mtime=float(row["mtime"]) if row["mtime"] else 0.0,
                line_count=int(row["line_count"]) if row["line_count"] else 0,
            )
            results.append((file_entry, keywords))

        return results
|
||||
|
||||
def list_semantic_metadata(
    self,
    offset: int = 0,
    limit: int = 50,
    llm_tool: Optional[str] = None,
) -> Tuple[List[Dict[str, Any]], int]:
    """List all semantic metadata with file information.

    Rows are returned newest first (by ``generated_at``), with the file id
    as a tie-breaker so that paging through results is stable.

    Args:
        offset: Number of records to skip (for pagination); negative
            values are treated as 0
        limit: Maximum records to return (max 100); negative values are
            treated as 0
        llm_tool: Optional filter by LLM tool used

    Returns:
        Tuple of (list of metadata dicts, total count). The total counts
        every row matching the filter, ignoring offset and limit.
    """
    import json

    # SQLite interprets a negative LIMIT as "no upper bound", which would
    # silently bypass the 100-row cap, so clamp on both sides.
    page_size = max(0, min(limit, 100))
    skip = max(0, offset)

    where_clause = " WHERE sm.llm_tool = ?" if llm_tool else ""
    filter_params: List[Any] = [llm_tool] if llm_tool else []

    base_query = (
        """
        SELECT f.id as file_id, f.name as file_name, f.full_path,
               f.language, f.line_count,
               sm.summary, sm.keywords, sm.purpose,
               sm.llm_tool, sm.generated_at
        FROM files f
        JOIN semantic_metadata sm ON f.id = sm.file_id
        """
        + where_clause
        + " ORDER BY sm.generated_at DESC, f.id LIMIT ? OFFSET ?"
    )
    count_query = (
        """
        SELECT COUNT(*) as total
        FROM files f
        JOIN semantic_metadata sm ON f.id = sm.file_id
        """
        + where_clause
    )

    with self._lock:
        conn = self._get_connection()

        total_row = conn.execute(count_query, filter_params).fetchone()
        total = int(total_row["total"]) if total_row else 0

        rows = conn.execute(base_query, filter_params + [page_size, skip]).fetchall()

        results = [
            {
                "file_id": int(row["file_id"]),
                "file_name": row["file_name"],
                "full_path": row["full_path"],
                "language": row["language"],
                "line_count": int(row["line_count"]) if row["line_count"] else 0,
                "summary": row["summary"],
                "keywords": json.loads(row["keywords"]) if row["keywords"] else [],
                "purpose": row["purpose"],
                "llm_tool": row["llm_tool"],
                "generated_at": float(row["generated_at"]) if row["generated_at"] else 0.0,
            }
            for row in rows
        ]

        return results, total
|
||||
|
||||
# === Subdirectory Links ===
|
||||
|
||||
def register_subdir(
|
||||
@@ -748,12 +964,28 @@ class DirIndexStore:
|
||||
"""
|
||||
)
|
||||
|
||||
# Semantic metadata table
|
||||
conn.execute(
|
||||
"""
|
||||
CREATE TABLE IF NOT EXISTS semantic_metadata (
|
||||
id INTEGER PRIMARY KEY,
|
||||
file_id INTEGER UNIQUE REFERENCES files(id) ON DELETE CASCADE,
|
||||
summary TEXT,
|
||||
keywords TEXT,
|
||||
purpose TEXT,
|
||||
llm_tool TEXT,
|
||||
generated_at REAL
|
||||
)
|
||||
"""
|
||||
)
|
||||
|
||||
# Indexes
|
||||
conn.execute("CREATE INDEX IF NOT EXISTS idx_files_name ON files(name)")
|
||||
conn.execute("CREATE INDEX IF NOT EXISTS idx_files_path ON files(full_path)")
|
||||
conn.execute("CREATE INDEX IF NOT EXISTS idx_subdirs_name ON subdirs(name)")
|
||||
conn.execute("CREATE INDEX IF NOT EXISTS idx_symbols_name ON symbols(name)")
|
||||
conn.execute("CREATE INDEX IF NOT EXISTS idx_symbols_file ON symbols(file_id)")
|
||||
conn.execute("CREATE INDEX IF NOT EXISTS idx_semantic_file ON semantic_metadata(file_id)")
|
||||
|
||||
except sqlite3.DatabaseError as exc:
|
||||
raise StorageError(f"Failed to create schema: {exc}") from exc
|
||||
|
||||
Reference in New Issue
Block a user