Add comprehensive tests for vector/semantic search functionality

- Implement full coverage tests for Embedder model loading and embedding generation
- Add CRUD operations and caching tests for VectorStore
- Include cosine similarity computation tests (see the test sketch below)
- Validate semantic search accuracy and relevance through various queries
- Establish performance benchmarks for embedding and search operations
- Ensure edge cases and error handling are covered
- Test thread safety and concurrent access scenarios
- Verify availability of semantic search dependencies
catlog22
2025-12-14 17:17:09 +08:00
parent 8d542b8e45
commit 79a2953862
47 changed files with 11208 additions and 4336 deletions
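
As a taste of the suite, here is a minimal sketch of the cosine-similarity tests called out above, assuming pytest and numpy; the cosine_similarity helper is illustrative, not necessarily the project's actual API:

import numpy as np

def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    # Guard against zero vectors to avoid division by zero.
    denom = float(np.linalg.norm(a) * np.linalg.norm(b))
    return float(np.dot(a, b)) / denom if denom else 0.0

def test_cosine_similarity_bounds():
    a = np.array([1.0, 0.0])
    assert cosine_similarity(a, a) == 1.0                     # identical vectors
    assert cosine_similarity(a, np.array([0.0, 1.0])) == 0.0  # orthogonal
    assert cosine_similarity(a, -a) == -1.0                   # opposite direction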


@@ -1098,3 +1098,132 @@ def clean(
else:
console.print(f"[red]Clean failed (unexpected):[/red] {exc}")
raise typer.Exit(code=1)
@app.command("semantic-list")
def semantic_list(
path: Path = typer.Option(Path("."), "--path", "-p", help="Project path to list metadata from."),
offset: int = typer.Option(0, "--offset", "-o", min=0, help="Number of records to skip."),
limit: int = typer.Option(50, "--limit", "-n", min=1, max=100, help="Maximum records to return."),
tool_filter: Optional[str] = typer.Option(None, "--tool", "-t", help="Filter by LLM tool (gemini/qwen)."),
json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."),
) -> None:
"""List semantic metadata entries for indexed files.
Shows files that have LLM-generated summaries and keywords.
Results are aggregated from all index databases in the project.
"""
_configure_logging(verbose)
base_path = path.expanduser().resolve()
registry: Optional[RegistryStore] = None
try:
registry = RegistryStore()
registry.initialize()
mapper = PathMapper()
project_info = registry.find_project(base_path)
if not project_info:
raise CodexLensError(f"No index found for: {base_path}. Run 'codex-lens init' first.")
index_dir = mapper.source_to_index_dir(base_path)
if not index_dir.exists():
raise CodexLensError(f"Index directory not found: {index_dir}")
all_results: list = []
total_count = 0
index_files = sorted(index_dir.rglob("_index.db"))
for db_path in index_files:
try:
store = DirIndexStore(db_path)
store.initialize()
results, count = store.list_semantic_metadata(
offset=0,
limit=1000,
llm_tool=tool_filter,
)
source_dir = mapper.index_to_source(db_path.parent)
for r in results:
r["source_dir"] = str(source_dir)
all_results.extend(results)
total_count += count
store.close()
except Exception as e:
if verbose:
console.print(f"[yellow]Warning: Error reading {db_path}: {e}[/yellow]")
all_results.sort(key=lambda x: x["generated_at"], reverse=True)
paginated = all_results[offset : offset + limit]
result = {
"path": str(base_path),
"total": total_count,
"offset": offset,
"limit": limit,
"count": len(paginated),
"entries": paginated,
}
if json_mode:
print_json(success=True, result=result)
else:
if not paginated:
console.print("[yellow]No semantic metadata found.[/yellow]")
console.print("Run 'codex-lens enhance' to generate metadata for indexed files.")
else:
table = Table(title=f"Semantic Metadata ({total_count} total)")
table.add_column("File", style="cyan", max_width=40)
table.add_column("Language", style="dim")
table.add_column("Purpose", max_width=30)
table.add_column("Keywords", max_width=25)
table.add_column("Tool")
for entry in paginated:
keywords_str = ", ".join(entry["keywords"][:3])
if len(entry["keywords"]) > 3:
keywords_str += f" (+{len(entry['keywords']) - 3})"
table.add_row(
entry["file_name"],
entry["language"] or "-",
(entry["purpose"] or "-")[:30],
keywords_str or "-",
entry["llm_tool"] or "-",
)
console.print(table)
if total_count > len(paginated):
console.print(
f"[dim]Showing {offset + 1}-{offset + len(paginated)} of {total_count}. "
"Use --offset and --limit for pagination.[/dim]"
)
except StorageError as exc:
if json_mode:
print_json(success=False, error=f"Storage error: {exc}")
else:
console.print(f"[red]Semantic-list failed (storage):[/red] {exc}")
raise typer.Exit(code=1)
except CodexLensError as exc:
if json_mode:
print_json(success=False, error=str(exc))
else:
console.print(f"[red]Semantic-list failed:[/red] {exc}")
raise typer.Exit(code=1)
except Exception as exc:
if json_mode:
print_json(success=False, error=f"Unexpected error: {exc}")
else:
console.print(f"[red]Semantic-list failed (unexpected):[/red] {exc}")
raise typer.Exit(code=1)
finally:
if registry is not None:
registry.close()
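
For reference, a typical invocation of the new command using the flags defined above (values illustrative):

codex-lens semantic-list --path . --limit 20 --tool gemini --json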


@@ -78,6 +78,11 @@ class Config:
}
)
llm_enabled: bool = False
llm_tool: str = "gemini"
llm_timeout_ms: int = 300000
llm_batch_size: int = 5
def __post_init__(self) -> None:
try:
self.data_dir = self.data_dir.expanduser().resolve()
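
A quick sketch of how the new LLM fields might be set, assuming Config's remaining fields all carry defaults (not shown in this hunk):

config = Config(llm_enabled=True, llm_tool="qwen", llm_timeout_ms=120000, llm_batch_size=10)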


@@ -30,6 +30,7 @@ class SearchOptions:
total_limit: Total result limit across all directories
include_symbols: Whether to include symbol search results
files_only: Return only file paths without excerpts
include_semantic: Whether to include semantic keyword search results
"""
depth: int = -1
max_workers: int = 8
@@ -37,6 +38,7 @@ class SearchOptions:
total_limit: int = 100
include_symbols: bool = False
files_only: bool = False
include_semantic: bool = False
@dataclass
@@ -378,7 +380,8 @@ class ChainSearchEngine:
idx_path,
query,
options.limit_per_dir,
options.files_only
options.files_only,
options.include_semantic
): idx_path
for idx_path in index_paths
}
@@ -400,7 +403,8 @@ class ChainSearchEngine:
def _search_single_index(self, index_path: Path,
query: str,
limit: int,
files_only: bool = False) -> List[SearchResult]:
files_only: bool = False,
include_semantic: bool = False) -> List[SearchResult]:
"""Search a single index database.
Handles exceptions gracefully, returning empty list on failure.
@@ -410,18 +414,40 @@ class ChainSearchEngine:
query: FTS5 query string
limit: Maximum results from this index
files_only: If True, skip snippet generation for faster search
include_semantic: If True, also search semantic keywords and merge results
Returns:
List of SearchResult objects (empty on error)
"""
try:
with DirIndexStore(index_path) as store:
# Get FTS results
if files_only:
# Fast path: return paths only without snippets
paths = store.search_files_only(query, limit=limit)
return [SearchResult(path=p, score=0.0, excerpt="") for p in paths]
fts_results = [SearchResult(path=p, score=0.0, excerpt="") for p in paths]
else:
return store.search_fts(query, limit=limit)
fts_results = store.search_fts(query, limit=limit)
# Optionally add semantic keyword results
if include_semantic:
try:
semantic_matches = store.search_semantic_keywords(query)
# Convert semantic matches to SearchResult with 0.8x weight
for file_entry, keywords in semantic_matches:
# Create excerpt from keywords
excerpt = f"Keywords: {', '.join(keywords[:5])}"
# Use a base score of 10.0 for semantic matches, weighted by 0.8
semantic_result = SearchResult(
path=str(file_entry.full_path),
score=10.0 * 0.8,
excerpt=excerpt
)
fts_results.append(semantic_result)
except Exception as sem_exc:
self.logger.debug(f"Semantic search error in {index_path}: {sem_exc}")
return fts_results
except Exception as exc:
self.logger.debug(f"Search error in {index_path}: {exc}")
return []
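
A caller-side sketch of the new flag; it assumes ChainSearchEngine exposes a public search(query, options) entry point, which this diff does not show:

options = SearchOptions(include_semantic=True, total_limit=50)
results = engine.search("token refresh", options)  # engine: a ChainSearchEngine
for r in results:
    print(r.path, r.score, r.excerpt)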


@@ -32,4 +32,38 @@ def check_semantic_available() -> tuple[bool, str | None]:
"""Check if semantic search dependencies are available."""
return SEMANTIC_AVAILABLE, _import_error
__all__ = ["SEMANTIC_AVAILABLE", "SEMANTIC_BACKEND", "check_semantic_available"]
# Export LLM enhancement classes
try:
from .llm_enhancer import (
LLMEnhancer,
LLMConfig,
SemanticMetadata,
FileData,
EnhancedSemanticIndexer,
create_enhancer,
create_enhanced_indexer,
)
LLM_AVAILABLE = True
except ImportError:
LLM_AVAILABLE = False
LLMEnhancer = None # type: ignore
LLMConfig = None # type: ignore
SemanticMetadata = None # type: ignore
FileData = None # type: ignore
EnhancedSemanticIndexer = None # type: ignore
create_enhancer = None # type: ignore
create_enhanced_indexer = None # type: ignore
__all__ = [
"SEMANTIC_AVAILABLE",
"SEMANTIC_BACKEND",
"check_semantic_available",
"LLM_AVAILABLE",
"LLMEnhancer",
"LLMConfig",
"SemanticMetadata",
"FileData",
"EnhancedSemanticIndexer",
"create_enhancer",
"create_enhanced_indexer",
]
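
Downstream code can gate on these flags; a minimal sketch using only the names exported above:

from codexlens.semantic import LLM_AVAILABLE, check_semantic_available, create_enhancer

available, err = check_semantic_available()
if available and LLM_AVAILABLE:
    enhancer = create_enhancer(tool="gemini")
else:
    print(f"Semantic enhancement unavailable: {err}")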


@@ -0,0 +1,667 @@
"""LLM-based semantic enhancement using CCW CLI.
This module provides LLM-generated descriptions that are then embedded
by fastembed for improved semantic search. The flow is:
Code → LLM Summary → fastembed embedding → VectorStore → semantic search
LLM-generated summaries match natural language queries better than raw code.
"""
from __future__ import annotations
import json
import logging
import os
import subprocess
import shutil
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional, TYPE_CHECKING
from codexlens.entities import SemanticChunk, Symbol
if TYPE_CHECKING:
from .embedder import Embedder
from .vector_store import VectorStore
logger = logging.getLogger(__name__)
@dataclass
class SemanticMetadata:
"""LLM-generated semantic metadata for a file or symbol."""
summary: str
keywords: List[str]
purpose: str
file_path: Optional[str] = None
symbol_name: Optional[str] = None
llm_tool: Optional[str] = None
@dataclass
class FileData:
"""File data for LLM processing."""
path: str
content: str
language: str
symbols: List[Symbol] = field(default_factory=list)
@dataclass
class LLMConfig:
"""Configuration for LLM enhancement.
Tool selection can be overridden via environment variables:
- CCW_CLI_SECONDARY_TOOL: Primary tool for LLM calls (default: gemini)
- CCW_CLI_FALLBACK_TOOL: Fallback tool if primary fails (default: qwen)
"""
tool: str = field(default_factory=lambda: os.environ.get("CCW_CLI_SECONDARY_TOOL", "gemini"))
fallback_tool: str = field(default_factory=lambda: os.environ.get("CCW_CLI_FALLBACK_TOOL", "qwen"))
timeout_ms: int = 300000
batch_size: int = 5
max_content_chars: int = 8000 # Max chars per file in batch prompt
enabled: bool = True
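# Usage sketch (illustrative): the environment overrides are read once, when
# the dataclass is constructed, and an explicit argument still wins:
#   os.environ["CCW_CLI_SECONDARY_TOOL"] = "qwen"
#   config = LLMConfig()               # config.tool == "qwen"
#   config = LLMConfig(tool="gemini")  # explicit argument overrides the env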
class LLMEnhancer:
"""LLM-based semantic enhancement using CCW CLI.
Generates code summaries and search keywords by calling
external LLM tools (gemini, qwen) via CCW CLI subprocess.
"""
PROMPT_TEMPLATE = '''PURPOSE: Generate semantic summaries and search keywords for code files
TASK:
- For each code block, generate a concise summary (1-2 sentences)
- Extract 5-10 relevant search keywords
- Identify the functional purpose/category
MODE: analysis
EXPECTED: JSON format output
=== CODE BLOCKS ===
{code_blocks}
=== OUTPUT FORMAT ===
Return ONLY valid JSON (no markdown, no explanation):
{{
"files": {{
"<file_path>": {{
"summary": "Brief description of what this code does",
"keywords": ["keyword1", "keyword2", ...],
"purpose": "category like: auth, api, util, ui, data, config, test"
}}
}}
}}'''
def __init__(self, config: LLMConfig | None = None) -> None:
"""Initialize LLM enhancer.
Args:
config: LLM configuration, uses defaults if None
"""
self.config = config or LLMConfig()
self._ccw_available: Optional[bool] = None
def check_available(self) -> bool:
"""Check if CCW CLI tool is available."""
if self._ccw_available is not None:
return self._ccw_available
self._ccw_available = shutil.which("ccw") is not None
if not self._ccw_available:
logger.warning("CCW CLI not found in PATH, LLM enhancement disabled")
return self._ccw_available
def enhance_files(
self,
files: List[FileData],
working_dir: Optional[Path] = None,
) -> Dict[str, SemanticMetadata]:
"""Enhance multiple files with LLM-generated semantic metadata.
Processes files in batches to manage token limits and API costs.
Args:
files: List of file data to process
working_dir: Optional working directory for CCW CLI
Returns:
Dict mapping file paths to SemanticMetadata
"""
if not self.config.enabled:
logger.debug("LLM enhancement disabled by config")
return {}
if not self.check_available():
return {}
if not files:
return {}
results: Dict[str, SemanticMetadata] = {}
batch_size = self.config.batch_size
for i in range(0, len(files), batch_size):
batch = files[i:i + batch_size]
try:
batch_results = self._process_batch(batch, working_dir)
results.update(batch_results)
logger.debug(
"Processed batch %d/%d: %d files enhanced",
i // batch_size + 1,
(len(files) + batch_size - 1) // batch_size,
len(batch_results),
)
except Exception as e:
logger.warning(
"Batch %d failed, continuing: %s",
i // batch_size + 1,
e,
)
continue
return results
def enhance_file(
self,
path: str,
content: str,
language: str,
working_dir: Optional[Path] = None,
) -> SemanticMetadata:
"""Enhance a single file with LLM-generated semantic metadata.
Convenience method that wraps enhance_files for single file processing.
Args:
path: File path
content: File content
language: Programming language
working_dir: Optional working directory for CCW CLI
Returns:
SemanticMetadata for the file
Raises:
ValueError: If enhancement fails
"""
file_data = FileData(path=path, content=content, language=language)
results = self.enhance_files([file_data], working_dir)
if path not in results:
# Return default metadata if enhancement failed
return SemanticMetadata(
summary=f"Code file written in {language}",
keywords=[language, "code"],
purpose="unknown",
file_path=path,
llm_tool=self.config.tool,
)
return results[path]
def _process_batch(
self,
files: List[FileData],
working_dir: Optional[Path] = None,
) -> Dict[str, SemanticMetadata]:
"""Process a single batch of files."""
prompt = self._build_batch_prompt(files)
# Try primary tool first, tracking which tool actually produced the result
used_tool = self.config.tool
result = self._invoke_ccw_cli(
prompt,
tool=used_tool,
working_dir=working_dir,
)
# Fall back to the secondary tool if the primary fails
if not result["success"] and self.config.fallback_tool:
logger.debug(
"Primary tool %s failed, trying fallback %s",
self.config.tool,
self.config.fallback_tool,
)
used_tool = self.config.fallback_tool
result = self._invoke_ccw_cli(
prompt,
tool=used_tool,
working_dir=working_dir,
)
if not result["success"]:
logger.warning("LLM call failed: %s", result.get("stderr", "unknown error"))
return {}
# Attribute the metadata to the tool that generated it (fallback included)
return self._parse_response(result["stdout"], used_tool)
def _build_batch_prompt(self, files: List[FileData]) -> str:
"""Build prompt for batch processing."""
code_blocks_parts: List[str] = []
for file_data in files:
# Truncate content if too long
content = file_data.content
if len(content) > self.config.max_content_chars:
content = content[:self.config.max_content_chars] + "\n... [truncated]"
# Format code block
lang_hint = file_data.language or "text"
code_block = f'''[FILE: {file_data.path}]
```{lang_hint}
{content}
```'''
code_blocks_parts.append(code_block)
code_blocks = "\n\n".join(code_blocks_parts)
return self.PROMPT_TEMPLATE.format(code_blocks=code_blocks)
def _invoke_ccw_cli(
self,
prompt: str,
tool: str = "gemini",
working_dir: Optional[Path] = None,
) -> Dict[str, Any]:
"""Invoke CCW CLI tool via subprocess.
Args:
prompt: The prompt to send to LLM
tool: Tool name (gemini, qwen, codex)
working_dir: Optional working directory
Returns:
Dict with success, stdout, stderr, exit_code
"""
import sys
timeout_seconds = (self.config.timeout_ms / 1000) + 30
# Build base arguments
base_args = [
"cli", "exec",
prompt, # Direct string argument
"--tool", tool,
"--mode", "analysis",
"--timeout", str(self.config.timeout_ms),
]
if working_dir:
base_args.extend(["--cd", str(working_dir)])
try:
if sys.platform == "win32":
# On Windows, ccw is a .CMD wrapper that requires shell
# Instead, directly invoke node with the ccw.js script
ccw_path = shutil.which("ccw")
if ccw_path and ccw_path.lower().endswith(".cmd"):
# Find the ccw.js script location
npm_dir = Path(ccw_path).parent
ccw_js = npm_dir / "node_modules" / "ccw" / "bin" / "ccw.js"
if ccw_js.exists():
cmd = ["node", str(ccw_js)] + base_args
else:
# Fallback to shell execution
cmd_str = "ccw " + " ".join(f'"{a}"' if " " in a else a for a in base_args)
result = subprocess.run(
cmd_str, shell=True, capture_output=True, text=True,
timeout=timeout_seconds, cwd=working_dir,
encoding="utf-8", errors="replace",
)
return {
"success": result.returncode == 0,
"stdout": result.stdout,
"stderr": result.stderr,
"exit_code": result.returncode,
}
else:
cmd = ["ccw"] + base_args
else:
cmd = ["ccw"] + base_args
result = subprocess.run(
cmd,
capture_output=True,
text=True,
timeout=timeout_seconds,
cwd=working_dir,
encoding="utf-8",
errors="replace",
)
return {
"success": result.returncode == 0,
"stdout": result.stdout,
"stderr": result.stderr,
"exit_code": result.returncode,
}
except subprocess.TimeoutExpired:
logger.warning("CCW CLI timeout after %ds", self.config.timeout_ms / 1000)
return {
"success": False,
"stdout": "",
"stderr": "timeout",
"exit_code": -1,
}
except FileNotFoundError:
logger.warning("CCW CLI not found - ensure 'ccw' is in PATH")
return {
"success": False,
"stdout": "",
"stderr": "ccw command not found",
"exit_code": -1,
}
except Exception as e:
logger.warning("CCW CLI invocation failed: %s", e)
return {
"success": False,
"stdout": "",
"stderr": str(e),
"exit_code": -1,
}
def _parse_response(
self,
stdout: str,
tool: str,
) -> Dict[str, SemanticMetadata]:
"""Parse LLM response into SemanticMetadata objects.
Args:
stdout: Raw stdout from CCW CLI
tool: Tool name used for generation
Returns:
Dict mapping file paths to SemanticMetadata
"""
results: Dict[str, SemanticMetadata] = {}
# Extract JSON from response (may be wrapped in markdown or other text)
json_str = self._extract_json(stdout)
if not json_str:
logger.warning("No JSON found in LLM response")
return results
try:
data = json.loads(json_str)
except json.JSONDecodeError as e:
logger.warning("Failed to parse LLM response JSON: %s", e)
return results
# Handle expected format: {"files": {"path": {...}}}
files_data = data.get("files", data)
if not isinstance(files_data, dict):
logger.warning("Unexpected response format: expected dict")
return results
for file_path, metadata in files_data.items():
if not isinstance(metadata, dict):
continue
try:
results[file_path] = SemanticMetadata(
summary=metadata.get("summary", ""),
keywords=metadata.get("keywords", []),
purpose=metadata.get("purpose", ""),
file_path=file_path,
llm_tool=tool,
)
except Exception as e:
logger.debug("Failed to parse metadata for %s: %s", file_path, e)
continue
return results
def _extract_json(self, text: str) -> Optional[str]:
"""Extract JSON object from text that may contain markdown or other content."""
# Try to find JSON object boundaries
text = text.strip()
# Remove markdown code blocks if present
if text.startswith("```"):
lines = text.split("\n")
# Remove first line (```json or ```)
lines = lines[1:]
# Find closing ```
for i, line in enumerate(lines):
if line.strip() == "```":
lines = lines[:i]
break
text = "\n".join(lines)
# Find JSON object
start = text.find("{")
if start == -1:
return None
# Find matching closing brace
depth = 0
end = start
for i, char in enumerate(text[start:], start):
if char == "{":
depth += 1
elif char == "}":
depth -= 1
if depth == 0:
end = i + 1
break
if depth != 0:
return None
return text[start:end]
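# Worked example (illustrative): given a fenced response such as
#   ```json
#   {"files": {"a.py": {"summary": "...", "keywords": [], "purpose": "util"}}}
#   ```
# the fence is stripped first, then brace matching runs from the first "{",
# so any trailing prose after the JSON object is ignored.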
def create_enhancer(
tool: str = "gemini",
timeout_ms: int = 300000,
batch_size: int = 5,
enabled: bool = True,
) -> LLMEnhancer:
"""Factory function to create LLM enhancer with custom config."""
config = LLMConfig(
tool=tool,
timeout_ms=timeout_ms,
batch_size=batch_size,
enabled=enabled,
)
return LLMEnhancer(config)
class EnhancedSemanticIndexer:
"""Integrates LLM enhancement with fastembed vector search.
Flow:
1. Code files → LLM generates summaries/keywords
2. Summaries → fastembed generates embeddings
3. Embeddings → VectorStore for similarity search
This produces better semantic search because:
- LLM summaries are natural language descriptions
- Natural language queries match summaries better than raw code
- Keywords expand search coverage
"""
def __init__(
self,
enhancer: LLMEnhancer,
embedder: "Embedder",
vector_store: "VectorStore",
) -> None:
"""Initialize enhanced semantic indexer.
Args:
enhancer: LLM enhancer for generating summaries
embedder: Fastembed embedder for vector generation
vector_store: Vector storage for similarity search
"""
self.enhancer = enhancer
self.embedder = embedder
self.vector_store = vector_store
def index_files(
self,
files: List[FileData],
working_dir: Optional[Path] = None,
) -> int:
"""Index files with LLM-enhanced semantic search.
Args:
files: List of file data to index
working_dir: Optional working directory for LLM calls
Returns:
Number of files successfully indexed
"""
if not files:
return 0
# Step 1: Generate LLM summaries
logger.info("Generating LLM summaries for %d files...", len(files))
metadata_map = self.enhancer.enhance_files(files, working_dir)
if not metadata_map:
logger.warning("No LLM metadata generated, falling back to raw code")
return self._index_raw_code(files)
# Step 2: Create semantic chunks from LLM summaries
chunks_to_embed: List[SemanticChunk] = []
file_paths: List[str] = []
for file_data in files:
metadata = metadata_map.get(file_data.path)
if metadata:
# Use LLM-generated summary + keywords for embedding
embeddable_text = self._create_embeddable_text(metadata, file_data)
chunk = SemanticChunk(
content=embeddable_text,
embedding=None,
metadata={
"file": file_data.path,
"language": file_data.language,
"summary": metadata.summary,
"keywords": metadata.keywords,
"purpose": metadata.purpose,
"llm_tool": metadata.llm_tool,
"strategy": "llm_enhanced",
},
)
else:
# Fallback: use truncated raw code
chunk = SemanticChunk(
content=file_data.content[:2000],
embedding=None,
metadata={
"file": file_data.path,
"language": file_data.language,
"strategy": "raw_code",
},
)
chunks_to_embed.append(chunk)
file_paths.append(file_data.path)
# Step 3: Generate embeddings
logger.info("Generating embeddings for %d chunks...", len(chunks_to_embed))
texts = [chunk.content for chunk in chunks_to_embed]
embeddings = self.embedder.embed(texts)
# Step 4: Store in vector store
indexed_count = 0
for chunk, embedding, file_path in zip(chunks_to_embed, embeddings, file_paths):
chunk.embedding = embedding
try:
self.vector_store.add_chunk(chunk, file_path)
indexed_count += 1
except Exception as e:
logger.debug("Failed to store chunk for %s: %s", file_path, e)
logger.info("Successfully indexed %d/%d files", indexed_count, len(files))
return indexed_count
def _create_embeddable_text(
self,
metadata: SemanticMetadata,
file_data: FileData,
) -> str:
"""Create text optimized for embedding from LLM metadata.
Combines summary, keywords, and purpose into a single string
that will produce good semantic matches for natural language queries.
"""
parts = []
# Summary is the primary content
if metadata.summary:
parts.append(metadata.summary)
# Purpose adds categorical context
if metadata.purpose:
parts.append(f"Category: {metadata.purpose}")
# Keywords expand search coverage
if metadata.keywords:
parts.append(f"Keywords: {', '.join(metadata.keywords)}")
# Add file name for context
parts.append(f"File: {Path(file_data.path).name}")
return "\n".join(parts)
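# Example output (illustrative) for a hypothetical auth helper:
#   Parses and validates JWT access tokens
#   Category: auth
#   Keywords: jwt, token, auth, signature, validate
#   File: tokens.py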
def _index_raw_code(self, files: List[FileData]) -> int:
"""Fallback: index raw code without LLM enhancement."""
indexed_count = 0
for file_data in files:
# Truncate to reasonable size
content = file_data.content[:2000]
chunk = SemanticChunk(
content=content,
embedding=None,
metadata={
"file": file_data.path,
"language": file_data.language,
"strategy": "raw_code",
},
)
try:
embedding = self.embedder.embed_single(content)
chunk.embedding = embedding
self.vector_store.add_chunk(chunk, file_data.path)
indexed_count += 1
except Exception as e:
logger.debug("Failed to index %s: %s", file_data.path, e)
return indexed_count
def create_enhanced_indexer(
vector_store_path: Path,
llm_tool: str = "gemini",
llm_enabled: bool = True,
) -> EnhancedSemanticIndexer:
"""Factory function to create an enhanced semantic indexer.
Args:
vector_store_path: Path for the vector store database
llm_tool: LLM tool to use (gemini, qwen)
llm_enabled: Whether to enable LLM enhancement
Returns:
Configured EnhancedSemanticIndexer instance
"""
from .embedder import Embedder
from .vector_store import VectorStore
enhancer = create_enhancer(tool=llm_tool, enabled=llm_enabled)
embedder = Embedder()
vector_store = VectorStore(vector_store_path)
return EnhancedSemanticIndexer(enhancer, embedder, vector_store)
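
Putting the pieces together, a hedged end-to-end sketch built on the factory above; the paths and file contents are illustrative:

from pathlib import Path

indexer = create_enhanced_indexer(Path("indexes/vectors.db"), llm_tool="gemini")
files = [
    FileData(
        path="src/auth.py",
        content=Path("src/auth.py").read_text(encoding="utf-8"),
        language="python",
    ),
]
indexed = indexer.index_files(files, working_dir=Path("."))
print(f"Indexed {indexed} of {len(files)} files")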


@@ -347,6 +347,222 @@ class DirIndexStore:
row = conn.execute("SELECT COUNT(*) AS c FROM files").fetchone()
return int(row["c"]) if row else 0
# === Semantic Metadata ===
def add_semantic_metadata(
self,
file_id: int,
summary: str,
keywords: List[str],
purpose: str,
llm_tool: str
) -> None:
"""Add or update semantic metadata for a file.
Args:
file_id: File ID from files table
summary: LLM-generated summary
keywords: List of keywords
purpose: Purpose/role of the file
llm_tool: Tool used to generate metadata (gemini/qwen)
"""
with self._lock:
conn = self._get_connection()
import json
import time
keywords_json = json.dumps(keywords)
generated_at = time.time()
conn.execute(
"""
INSERT INTO semantic_metadata(file_id, summary, keywords, purpose, llm_tool, generated_at)
VALUES(?, ?, ?, ?, ?, ?)
ON CONFLICT(file_id) DO UPDATE SET
summary=excluded.summary,
keywords=excluded.keywords,
purpose=excluded.purpose,
llm_tool=excluded.llm_tool,
generated_at=excluded.generated_at
""",
(file_id, summary, keywords_json, purpose, llm_tool, generated_at),
)
conn.commit()
def get_semantic_metadata(self, file_id: int) -> Optional[Dict[str, Any]]:
"""Get semantic metadata for a file.
Args:
file_id: File ID from files table
Returns:
Dict with summary, keywords, purpose, llm_tool, generated_at, or None if not found
"""
with self._lock:
conn = self._get_connection()
row = conn.execute(
"""
SELECT summary, keywords, purpose, llm_tool, generated_at
FROM semantic_metadata WHERE file_id=?
""",
(file_id,),
).fetchone()
if not row:
return None
import json
return {
"summary": row["summary"],
"keywords": json.loads(row["keywords"]) if row["keywords"] else [],
"purpose": row["purpose"],
"llm_tool": row["llm_tool"],
"generated_at": float(row["generated_at"]) if row["generated_at"] else 0.0,
}
def get_files_without_semantic(self) -> List[FileEntry]:
"""Get all files that don't have semantic metadata.
Returns:
List of FileEntry objects without semantic metadata
"""
with self._lock:
conn = self._get_connection()
rows = conn.execute(
"""
SELECT f.id, f.name, f.full_path, f.language, f.mtime, f.line_count
FROM files f
LEFT JOIN semantic_metadata sm ON f.id = sm.file_id
WHERE sm.id IS NULL
ORDER BY f.name
"""
).fetchall()
return [
FileEntry(
id=int(row["id"]),
name=row["name"],
full_path=Path(row["full_path"]),
language=row["language"],
mtime=float(row["mtime"]) if row["mtime"] else 0.0,
line_count=int(row["line_count"]) if row["line_count"] else 0,
)
for row in rows
]
def search_semantic_keywords(self, keyword: str) -> List[Tuple[FileEntry, List[str]]]:
"""Search files by semantic keywords.
Args:
keyword: Keyword to search for (case-insensitive)
Returns:
List of (FileEntry, keywords) tuples where keyword matches
"""
with self._lock:
conn = self._get_connection()
keyword_pattern = f"%{keyword}%"
rows = conn.execute(
"""
SELECT f.id, f.name, f.full_path, f.language, f.mtime, f.line_count, sm.keywords
FROM files f
JOIN semantic_metadata sm ON f.id = sm.file_id
WHERE sm.keywords LIKE ? COLLATE NOCASE
ORDER BY f.name
""",
(keyword_pattern,),
).fetchall()
import json
results = []
for row in rows:
file_entry = FileEntry(
id=int(row["id"]),
name=row["name"],
full_path=Path(row["full_path"]),
language=row["language"],
mtime=float(row["mtime"]) if row["mtime"] else 0.0,
line_count=int(row["line_count"]) if row["line_count"] else 0,
)
keywords = json.loads(row["keywords"]) if row["keywords"] else []
results.append((file_entry, keywords))
return results
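# Note: keywords are stored as a JSON-encoded list (see add_semantic_metadata),
# so the LIKE filter above performs a substring match over that JSON text;
# searching "auth" also matches entries such as "authentication" or "author".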
def list_semantic_metadata(
self,
offset: int = 0,
limit: int = 50,
llm_tool: Optional[str] = None,
) -> Tuple[List[Dict[str, Any]], int]:
"""List all semantic metadata with file information.
Args:
offset: Number of records to skip (for pagination)
limit: Maximum records to return (max 100)
llm_tool: Optional filter by LLM tool used
Returns:
Tuple of (list of metadata dicts, total count)
"""
import json
with self._lock:
conn = self._get_connection()
base_query = """
SELECT f.id as file_id, f.name as file_name, f.full_path,
f.language, f.line_count,
sm.summary, sm.keywords, sm.purpose,
sm.llm_tool, sm.generated_at
FROM files f
JOIN semantic_metadata sm ON f.id = sm.file_id
"""
count_query = """
SELECT COUNT(*) as total
FROM files f
JOIN semantic_metadata sm ON f.id = sm.file_id
"""
params: List[Any] = []
if llm_tool:
base_query += " WHERE sm.llm_tool = ?"
count_query += " WHERE sm.llm_tool = ?"
params.append(llm_tool)
base_query += " ORDER BY sm.generated_at DESC LIMIT ? OFFSET ?"
params.extend([min(limit, 100), offset])
count_params = [llm_tool] if llm_tool else []
total_row = conn.execute(count_query, count_params).fetchone()
total = int(total_row["total"]) if total_row else 0
rows = conn.execute(base_query, params).fetchall()
results = []
for row in rows:
results.append({
"file_id": int(row["file_id"]),
"file_name": row["file_name"],
"full_path": row["full_path"],
"language": row["language"],
"line_count": int(row["line_count"]) if row["line_count"] else 0,
"summary": row["summary"],
"keywords": json.loads(row["keywords"]) if row["keywords"] else [],
"purpose": row["purpose"],
"llm_tool": row["llm_tool"],
"generated_at": float(row["generated_at"]) if row["generated_at"] else 0.0,
})
return results, total
# === Subdirectory Links ===
def register_subdir(
@@ -748,12 +964,28 @@ class DirIndexStore:
"""
)
# Semantic metadata table
conn.execute(
"""
CREATE TABLE IF NOT EXISTS semantic_metadata (
id INTEGER PRIMARY KEY,
file_id INTEGER UNIQUE REFERENCES files(id) ON DELETE CASCADE,
summary TEXT,
keywords TEXT,
purpose TEXT,
llm_tool TEXT,
generated_at REAL
)
"""
)
# Indexes
conn.execute("CREATE INDEX IF NOT EXISTS idx_files_name ON files(name)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_files_path ON files(full_path)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_subdirs_name ON subdirs(name)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_symbols_name ON symbols(name)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_symbols_file ON symbols(file_id)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_semantic_file ON semantic_metadata(file_id)")
except sqlite3.DatabaseError as exc:
raise StorageError(f"Failed to create schema: {exc}") from exc
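
Finally, a short usage sketch of the new metadata API, with illustrative values; the DirIndexStore lifecycle follows the pattern used by the semantic-list command above:

store = DirIndexStore(db_path)
store.initialize()
store.add_semantic_metadata(
    file_id=42,
    summary="Parses and validates JWT access tokens",
    keywords=["jwt", "auth", "token"],
    purpose="auth",
    llm_tool="gemini",
)
entries, total = store.list_semantic_metadata(limit=10, llm_tool="gemini")
store.close()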