Add comprehensive tests for query parsing and Reciprocal Rank Fusion

- Implemented tests for the QueryParser class, covering various identifier splitting methods (CamelCase, snake_case, kebab-case), OR expansion, and FTS5 operator preservation.
- Added parameterized tests to validate expected token outputs for different query formats.
- Created edge case tests to ensure robustness against unusual input scenarios.
- Developed tests for the Reciprocal Rank Fusion (RRF) algorithm, including score computation, weight handling, and result ranking across multiple sources.
- Included tests for normalization of BM25 scores and tagging search results with source metadata.
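
A minimal sketch of what the QueryParser tests could look like, assuming the interfaces shown in the diffs below (QueryParser.preprocess_query); the module path and the parametrized cases are illustrative, not copied from the actual test files:

# Hypothetical test sketch; the real suite added by this commit may differ.
import pytest

from codexlens.search.query_parser import QueryParser  # assumed module path


@pytest.mark.parametrize(
    "query, expected_tokens",
    [
        ("UserAuth", {"UserAuth", "User", "Auth"}),                    # CamelCase
        ("get_user_data", {"get_user_data", "get", "user", "data"}),   # snake_case
        ("get-user-data", {"get-user-data", "get", "user", "data"}),   # kebab-case
    ],
)
def test_identifier_expansion(query, expected_tokens):
    expanded = QueryParser().preprocess_query(query)
    assert set(expanded.split(" OR ")) == expected_tokens


def test_fts5_operators_preserved():
    # Queries containing FTS5 operators or quotes are passed through unchanged.
    query = '"user auth" AND token'
    assert QueryParser().preprocess_query(query) == query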
catlog22
2025-12-16 10:20:19 +08:00
parent 35485bbbb1
commit 3da0ef2adb
39 changed files with 6171 additions and 240 deletions

View File

@@ -20,6 +20,7 @@ from codexlens.parsers.factory import ParserFactory
from codexlens.storage.path_mapper import PathMapper
from codexlens.storage.registry import RegistryStore, ProjectInfo
from codexlens.storage.index_tree import IndexTreeBuilder
from codexlens.storage.dir_index import DirIndexStore
from codexlens.search.chain_search import ChainSearchEngine, SearchOptions
from .output import (
@@ -77,6 +78,7 @@ def init(
help="Limit indexing to specific languages (repeat or comma-separated).",
),
workers: int = typer.Option(4, "--workers", "-w", min=1, max=16, help="Parallel worker processes."),
force: bool = typer.Option(False, "--force", "-f", help="Force full reindex (skip incremental mode)."),
json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."),
) -> None:
@@ -84,6 +86,9 @@ def init(
Indexes are stored in ~/.codexlens/indexes/ with mirrored directory structure.
Set CODEXLENS_INDEX_DIR to customize the index location.
By default, uses incremental indexing (skip unchanged files).
Use --force to rebuild all files regardless of modification time.
"""
_configure_logging(verbose)
config = Config()
@@ -96,14 +101,18 @@ def init(
registry.initialize()
mapper = PathMapper()
builder = IndexTreeBuilder(registry, mapper, config)
builder = IndexTreeBuilder(registry, mapper, config, incremental=not force)
console.print(f"[bold]Building index for:[/bold] {base_path}")
if force:
console.print(f"[bold]Building index for:[/bold] {base_path} [yellow](FULL reindex)[/yellow]")
else:
console.print(f"[bold]Building index for:[/bold] {base_path} [dim](incremental)[/dim]")
build_result = builder.build(
source_root=base_path,
languages=languages,
workers=workers,
force_full=force,
)
result = {
@@ -172,6 +181,8 @@ def search(
limit: int = typer.Option(20, "--limit", "-n", min=1, max=500, help="Max results."),
depth: int = typer.Option(-1, "--depth", "-d", help="Search depth (-1 = unlimited, 0 = current only)."),
files_only: bool = typer.Option(False, "--files-only", "-f", help="Return only file paths without content snippets."),
mode: str = typer.Option("exact", "--mode", "-m", help="Search mode: exact, fuzzy, hybrid, vector."),
weights: Optional[str] = typer.Option(None, "--weights", help="Custom RRF weights as 'exact,fuzzy,vector' (e.g., '0.5,0.3,0.2')."),
json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."),
) -> None:
@@ -179,10 +190,51 @@ def search(
Uses chain search across directory indexes.
Use --depth to limit search recursion (0 = current dir only).
Search Modes:
- exact: Exact FTS using unicode61 tokenizer (default)
- fuzzy: Fuzzy FTS using trigram tokenizer
- hybrid: RRF fusion of exact + fuzzy (recommended)
- vector: Semantic vector search (future)
Hybrid Mode:
Default weights: exact=0.4, fuzzy=0.3, vector=0.3
Use --weights to customize (e.g., --weights 0.5,0.3,0.2)
"""
_configure_logging(verbose)
search_path = path.expanduser().resolve()
# Validate mode
valid_modes = ["exact", "fuzzy", "hybrid", "vector"]
if mode not in valid_modes:
if json_mode:
print_json(success=False, error=f"Invalid mode: {mode}. Must be one of: {', '.join(valid_modes)}")
else:
console.print(f"[red]Invalid mode:[/red] {mode}")
console.print(f"[dim]Valid modes: {', '.join(valid_modes)}[/dim]")
raise typer.Exit(code=1)
# Parse custom weights if provided
hybrid_weights = None
if weights:
try:
weight_parts = [float(w.strip()) for w in weights.split(",")]
if len(weight_parts) == 3:
weight_sum = sum(weight_parts)
if abs(weight_sum - 1.0) > 0.01:
console.print(f"[yellow]Warning: Weights sum to {weight_sum:.2f}, should sum to 1.0. Normalizing...[/yellow]")
# Normalize weights
weight_parts = [w / weight_sum for w in weight_parts]
hybrid_weights = {
"exact": weight_parts[0],
"fuzzy": weight_parts[1],
"vector": weight_parts[2],
}
else:
console.print("[yellow]Warning: Invalid weights format (need 3 values). Using defaults.[/yellow]")
except ValueError:
console.print("[yellow]Warning: Invalid weights format. Using defaults.[/yellow]")
registry: RegistryStore | None = None
try:
registry = RegistryStore()
@@ -190,10 +242,18 @@ def search(
mapper = PathMapper()
engine = ChainSearchEngine(registry, mapper)
# Map mode to options
hybrid_mode = mode == "hybrid"
enable_fuzzy = mode in ["fuzzy", "hybrid"]
options = SearchOptions(
depth=depth,
total_limit=limit,
files_only=files_only,
hybrid_mode=hybrid_mode,
enable_fuzzy=enable_fuzzy,
hybrid_weights=hybrid_weights,
)
if files_only:
@@ -208,8 +268,17 @@ def search(
result = engine.search(query, search_path, options)
payload = {
"query": query,
"mode": mode,
"count": len(result.results),
"results": [{"path": r.path, "score": r.score, "excerpt": r.excerpt} for r in result.results],
"results": [
{
"path": r.path,
"score": r.score,
"excerpt": r.excerpt,
"source": getattr(r, "search_source", None),
}
for r in result.results
],
"stats": {
"dirs_searched": result.stats.dirs_searched,
"files_matched": result.stats.files_matched,
@@ -219,9 +288,8 @@ def search(
if json_mode:
print_json(success=True, result=payload)
else:
render_search_results(result.results)
if verbose:
console.print(f"[dim]Searched {result.stats.dirs_searched} directories in {result.stats.time_ms:.1f}ms[/dim]")
render_search_results(result.results, verbose=verbose)
console.print(f"[dim]Mode: {mode} | Searched {result.stats.dirs_searched} directories in {result.stats.time_ms:.1f}ms[/dim]")
except SearchError as exc:
if json_mode:
@@ -404,6 +472,27 @@ def status(
if f.is_file():
index_size += f.stat().st_size
# Check schema version and enabled features
schema_version = None
has_dual_fts = False
if projects and index_root.exists():
# Check first index database for features
index_files = list(index_root.rglob("_index.db"))
if index_files:
try:
with DirIndexStore(index_files[0]) as store:
with store._lock:
conn = store._get_connection()
schema_version = store._get_schema_version(conn)
# Check if dual FTS tables exist
cursor = conn.execute(
"SELECT name FROM sqlite_master WHERE type='table' AND name IN ('search_fts_exact', 'search_fts_fuzzy')"
)
fts_tables = [row[0] for row in cursor.fetchall()]
has_dual_fts = len(fts_tables) == 2
except Exception:
pass
stats = {
"index_root": str(index_root),
"registry_path": str(_get_registry_path()),
@@ -412,6 +501,13 @@ def status(
"total_dirs": total_dirs,
"index_size_bytes": index_size,
"index_size_mb": round(index_size / (1024 * 1024), 2),
"schema_version": schema_version,
"features": {
"exact_fts": True, # Always available
"fuzzy_fts": has_dual_fts,
"hybrid_search": has_dual_fts,
"vector_search": False, # Not yet implemented
},
}
if json_mode:
@@ -424,6 +520,17 @@ def status(
console.print(f" Total Files: {stats['total_files']}")
console.print(f" Total Directories: {stats['total_dirs']}")
console.print(f" Index Size: {stats['index_size_mb']} MB")
if schema_version:
console.print(f" Schema Version: {schema_version}")
console.print("\n[bold]Search Backends:[/bold]")
console.print(f" Exact FTS: ✓ (unicode61)")
if has_dual_fts:
console.print(f" Fuzzy FTS: ✓ (trigram)")
console.print(f" Hybrid Search: ✓ (RRF fusion)")
else:
console.print(f" Fuzzy FTS: ✗ (run 'migrate' to enable)")
console.print(f" Hybrid Search: ✗ (run 'migrate' to enable)")
console.print(f" Vector Search: ✗ (future)")
except StorageError as exc:
if json_mode:
@@ -778,6 +885,139 @@ def config(
raise typer.Exit(code=1)
@app.command()
def migrate(
path: Path = typer.Argument(Path("."), exists=True, file_okay=False, dir_okay=True, help="Project root to migrate."),
json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."),
) -> None:
"""Migrate project indexes to latest schema (Dual-FTS upgrade).
Upgrades all _index.db files in the project to schema version 4, which includes:
- Dual FTS tables (exact + fuzzy)
- Encoding detection support
- Incremental indexing metadata
This is a safe operation that preserves all existing data.
Progress is shown during migration.
"""
_configure_logging(verbose)
base_path = path.expanduser().resolve()
registry: RegistryStore | None = None
try:
registry = RegistryStore()
registry.initialize()
mapper = PathMapper()
# Find project
project_info = registry.get_project(base_path)
if not project_info:
raise CodexLensError(f"No index found for: {base_path}. Run 'codex-lens init' first.")
index_dir = mapper.source_to_index_dir(base_path)
if not index_dir.exists():
raise CodexLensError(f"Index directory not found: {index_dir}")
# Find all _index.db files
index_files = list(index_dir.rglob("_index.db"))
if not index_files:
if json_mode:
print_json(success=True, result={"message": "No indexes to migrate", "migrated": 0})
else:
console.print("[yellow]No indexes found to migrate.[/yellow]")
return
migrated_count = 0
error_count = 0
already_migrated = 0
with Progress(
SpinnerColumn(),
TextColumn("[progress.description]{task.description}"),
BarColumn(),
TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
TextColumn("({task.completed}/{task.total})"),
TimeElapsedColumn(),
console=console,
) as progress:
task = progress.add_task(f"Migrating {len(index_files)} indexes...", total=len(index_files))
for db_path in index_files:
try:
store = DirIndexStore(db_path)
# Check current version
with store._lock:
conn = store._get_connection()
current_version = store._get_schema_version(conn)
if current_version >= DirIndexStore.SCHEMA_VERSION:
already_migrated += 1
if verbose:
progress.console.print(f"[dim]Already migrated: {db_path.parent.name}[/dim]")
elif current_version > 0:
# Apply migrations
store._apply_migrations(conn, current_version)
store._set_schema_version(conn, DirIndexStore.SCHEMA_VERSION)
conn.commit()
migrated_count += 1
if verbose:
progress.console.print(f"[green]Migrated: {db_path.parent.name} (v{current_version} → v{DirIndexStore.SCHEMA_VERSION})[/green]")
else:
# New database, initialize directly
store.initialize()
migrated_count += 1
store.close()
except Exception as e:
error_count += 1
if verbose:
progress.console.print(f"[red]Error migrating {db_path}: {e}[/red]")
progress.update(task, advance=1)
result = {
"path": str(base_path),
"total_indexes": len(index_files),
"migrated": migrated_count,
"already_migrated": already_migrated,
"errors": error_count,
}
if json_mode:
print_json(success=True, result=result)
else:
console.print(f"[green]Migration complete:[/green]")
console.print(f" Total indexes: {len(index_files)}")
console.print(f" Migrated: {migrated_count}")
console.print(f" Already up-to-date: {already_migrated}")
if error_count > 0:
console.print(f" [yellow]Errors: {error_count}[/yellow]")
except StorageError as exc:
if json_mode:
print_json(success=False, error=f"Storage error: {exc}")
else:
console.print(f"[red]Migration failed (storage):[/red] {exc}")
raise typer.Exit(code=1)
except CodexLensError as exc:
if json_mode:
print_json(success=False, error=str(exc))
else:
console.print(f"[red]Migration failed:[/red] {exc}")
raise typer.Exit(code=1)
except Exception as exc:
if json_mode:
print_json(success=False, error=f"Unexpected error: {exc}")
else:
console.print(f"[red]Migration failed (unexpected):[/red] {exc}")
raise typer.Exit(code=1)
finally:
if registry is not None:
registry.close()
@app.command()
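
For reference, the same normalization arithmetic the --weights handler performs above, as a tiny standalone sketch (values are illustrative):

# Standalone sketch of the --weights normalization (illustrative values).
raw = [0.5, 0.3, 0.4]                      # --weights 0.5,0.3,0.4 sums to 1.2
total = sum(raw)
normalized = dict(zip(("exact", "fuzzy", "vector"), (w / total for w in raw)))
print({k: round(v, 3) for k, v in normalized.items()})
# {'exact': 0.417, 'fuzzy': 0.25, 'vector': 0.333}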

View File

@@ -41,15 +41,45 @@ def print_json(*, success: bool, result: Any = None, error: str | None = None) -
console.print_json(json.dumps(payload, ensure_ascii=False))
def render_search_results(results: Sequence[SearchResult], *, title: str = "Search Results") -> None:
def render_search_results(
results: Sequence[SearchResult], *, title: str = "Search Results", verbose: bool = False
) -> None:
"""Render search results with optional source tags in verbose mode.
Args:
results: Search results to display
title: Table title
verbose: If True, show search source tags ([E], [F], [V]) and fusion scores
"""
table = Table(title=title, show_lines=False)
if verbose:
# Verbose mode: show source tags
table.add_column("Source", style="dim", width=6, justify="center")
table.add_column("Path", style="cyan", no_wrap=True)
table.add_column("Score", style="magenta", justify="right")
table.add_column("Excerpt", style="white")
for res in results:
excerpt = res.excerpt or ""
table.add_row(res.path, f"{res.score:.3f}", excerpt)
score_str = f"{res.score:.3f}"
if verbose:
# Extract search source tag if available
source = getattr(res, "search_source", None)
source_tag = ""
if source == "exact":
source_tag = "[E]"
elif source == "fuzzy":
source_tag = "[F]"
elif source == "vector":
source_tag = "[V]"
elif source == "fusion":
source_tag = "[RRF]"
table.add_row(source_tag, res.path, score_str, excerpt)
else:
table.add_row(res.path, score_str, excerpt)
console.print(table)

View File

@@ -0,0 +1,202 @@
"""Optional encoding detection module for CodexLens.
Provides automatic encoding detection with graceful fallback to UTF-8.
Install with: pip install codexlens[encoding]
"""
from __future__ import annotations
import logging
from pathlib import Path
from typing import Tuple, Optional
log = logging.getLogger(__name__)
# Feature flag for encoding detection availability
ENCODING_DETECTION_AVAILABLE = False
_import_error: Optional[str] = None
def _detect_chardet_backend() -> Tuple[bool, Optional[str]]:
"""Detect if chardet or charset-normalizer is available."""
try:
import chardet
return True, None
except ImportError:
pass
try:
from charset_normalizer import from_bytes
return True, None
except ImportError:
pass
return False, "chardet not available. Install with: pip install codexlens[encoding]"
# Initialize on module load
ENCODING_DETECTION_AVAILABLE, _import_error = _detect_chardet_backend()
def check_encoding_available() -> Tuple[bool, Optional[str]]:
"""Check if encoding detection dependencies are available.
Returns:
Tuple of (available, error_message)
"""
return ENCODING_DETECTION_AVAILABLE, _import_error
def detect_encoding(content_bytes: bytes, confidence_threshold: float = 0.7) -> str:
"""Detect encoding from file content bytes.
Uses chardet or charset-normalizer with configurable confidence threshold.
Falls back to UTF-8 if confidence is too low or detection unavailable.
Args:
content_bytes: Raw file content as bytes
confidence_threshold: Minimum confidence (0.0-1.0) to accept detection
Returns:
Detected encoding name (e.g., 'utf-8', 'iso-8859-1', 'gbk')
Returns 'utf-8' as fallback if detection fails or confidence too low
"""
if not ENCODING_DETECTION_AVAILABLE:
log.debug("Encoding detection not available, using UTF-8 fallback")
return "utf-8"
if not content_bytes:
return "utf-8"
try:
# Try chardet first
try:
import chardet
result = chardet.detect(content_bytes)
encoding = result.get("encoding")
confidence = result.get("confidence", 0.0)
if encoding and confidence >= confidence_threshold:
log.debug(f"Detected encoding: {encoding} (confidence: {confidence:.2f})")
# Normalize encoding name: replace underscores with hyphens
return encoding.lower().replace('_', '-')
else:
log.debug(
f"Low confidence encoding detection: {encoding} "
f"(confidence: {confidence:.2f}), using UTF-8 fallback"
)
return "utf-8"
except ImportError:
pass
# Fallback to charset-normalizer
try:
from charset_normalizer import from_bytes
results = from_bytes(content_bytes)
if results:
best = results.best()
if best and best.encoding:
log.debug(f"Detected encoding via charset-normalizer: {best.encoding}")
# Normalize encoding name: replace underscores with hyphens
return best.encoding.lower().replace('_', '-')
except ImportError:
pass
except Exception as e:
log.warning(f"Encoding detection failed: {e}, using UTF-8 fallback")
return "utf-8"
def read_file_safe(
path: Path | str,
confidence_threshold: float = 0.7,
max_detection_bytes: int = 100_000
) -> Tuple[str, str]:
"""Read file with automatic encoding detection and safe decoding.
Reads file bytes, detects encoding, and decodes with error replacement
to preserve file structure even with encoding issues.
Args:
path: Path to file to read
confidence_threshold: Minimum confidence for encoding detection
max_detection_bytes: Maximum bytes to use for encoding detection (default 100KB)
Returns:
Tuple of (content, detected_encoding)
- content: Decoded file content (with U+FFFD replacement characters for unmappable bytes)
- detected_encoding: Detected encoding name
Raises:
OSError: If file cannot be read
IsADirectoryError: If path is a directory
"""
file_path = Path(path) if isinstance(path, str) else path
# Read file bytes
try:
content_bytes = file_path.read_bytes()
except Exception as e:
log.error(f"Failed to read file {file_path}: {e}")
raise
# Detect encoding from first N bytes for performance
detection_sample = content_bytes[:max_detection_bytes] if len(content_bytes) > max_detection_bytes else content_bytes
encoding = detect_encoding(detection_sample, confidence_threshold)
# Decode with error replacement to preserve structure
try:
content = content_bytes.decode(encoding, errors='replace')
log.debug(f"Successfully decoded {file_path} using {encoding}")
return content, encoding
except Exception as e:
# Final fallback to UTF-8 with replacement
log.warning(f"Failed to decode {file_path} with {encoding}, using UTF-8: {e}")
content = content_bytes.decode('utf-8', errors='replace')
return content, 'utf-8'
def is_binary_file(path: Path | str, sample_size: int = 8192) -> bool:
"""Check if file is likely binary by sampling first bytes.
Uses heuristic: if >30% of sample bytes are null or non-text, consider binary.
Args:
path: Path to file to check
sample_size: Number of bytes to sample (default 8KB)
Returns:
True if file appears to be binary, False otherwise
"""
file_path = Path(path) if isinstance(path, str) else path
try:
with file_path.open('rb') as f:
sample = f.read(sample_size)
if not sample:
return False
# Count null bytes and non-printable characters
null_count = sample.count(b'\x00')
non_text_count = sum(1 for byte in sample if byte < 0x20 and byte not in (0x09, 0x0a, 0x0d))
# If >30% null bytes or >50% non-text, consider binary
null_ratio = null_count / len(sample)
non_text_ratio = non_text_count / len(sample)
return null_ratio > 0.3 or non_text_ratio > 0.5
except Exception as e:
log.debug(f"Binary check failed for {file_path}: {e}, assuming text")
return False
__all__ = [
"ENCODING_DETECTION_AVAILABLE",
"check_encoding_available",
"detect_encoding",
"read_file_safe",
"is_binary_file",
]
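
A short usage sketch for this module; the import path below is an assumption, since the diff view does not show the new file's location in the package:

# Hypothetical usage; the module path is assumed.
from pathlib import Path

from codexlens.encoding import check_encoding_available, is_binary_file, read_file_safe

available, err = check_encoding_available()
if not available:
    print(f"Encoding detection disabled, UTF-8 fallback in effect: {err}")

path = Path("src/example.py")  # placeholder path
if path.is_file() and not is_binary_file(path):
    content, encoding = read_file_safe(path, confidence_threshold=0.7)
    print(f"Decoded {path} as {encoding} ({len(content)} chars)")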

View File

@@ -18,6 +18,7 @@ from codexlens.storage.registry import RegistryStore, DirMapping
from codexlens.storage.dir_index import DirIndexStore, SubdirLink
from codexlens.storage.path_mapper import PathMapper
from codexlens.storage.sqlite_store import SQLiteStore
from codexlens.search.hybrid_search import HybridSearchEngine
@dataclass
@@ -32,6 +33,9 @@ class SearchOptions:
include_symbols: Whether to include symbol search results
files_only: Return only file paths without excerpts
include_semantic: Whether to include semantic keyword search results
hybrid_mode: Enable hybrid search with RRF fusion (default False)
enable_fuzzy: Enable fuzzy FTS in hybrid mode (default True)
hybrid_weights: Custom RRF weights for hybrid search (optional)
"""
depth: int = -1
max_workers: int = 8
@@ -40,6 +44,9 @@ class SearchOptions:
include_symbols: bool = False
files_only: bool = False
include_semantic: bool = False
hybrid_mode: bool = False
enable_fuzzy: bool = True
hybrid_weights: Optional[Dict[str, float]] = None
@dataclass
@@ -484,7 +491,10 @@ class ChainSearchEngine:
query,
options.limit_per_dir,
options.files_only,
options.include_semantic
options.include_semantic,
options.hybrid_mode,
options.enable_fuzzy,
options.hybrid_weights
): idx_path
for idx_path in index_paths
}
@@ -507,7 +517,10 @@ class ChainSearchEngine:
query: str,
limit: int,
files_only: bool = False,
include_semantic: bool = False) -> List[SearchResult]:
include_semantic: bool = False,
hybrid_mode: bool = False,
enable_fuzzy: bool = True,
hybrid_weights: Optional[Dict[str, float]] = None) -> List[SearchResult]:
"""Search a single index database.
Handles exceptions gracefully, returning empty list on failure.
@@ -518,39 +531,54 @@ class ChainSearchEngine:
limit: Maximum results from this index
files_only: If True, skip snippet generation for faster search
include_semantic: If True, also search semantic keywords and merge results
hybrid_mode: If True, use hybrid search with RRF fusion
enable_fuzzy: Enable fuzzy FTS in hybrid mode
hybrid_weights: Custom RRF weights for hybrid search
Returns:
List of SearchResult objects (empty on error)
"""
try:
with DirIndexStore(index_path) as store:
# Get FTS results
if files_only:
# Fast path: return paths only without snippets
paths = store.search_files_only(query, limit=limit)
fts_results = [SearchResult(path=p, score=0.0, excerpt="") for p in paths]
else:
fts_results = store.search_fts(query, limit=limit)
# Optionally add semantic keyword results
if include_semantic:
try:
semantic_matches = store.search_semantic_keywords(query)
# Convert semantic matches to SearchResult with 0.8x weight
for file_entry, keywords in semantic_matches:
# Create excerpt from keywords
excerpt = f"Keywords: {', '.join(keywords[:5])}"
# Use a base score of 10.0 for semantic matches, weighted by 0.8
semantic_result = SearchResult(
path=str(file_entry.full_path),
score=10.0 * 0.8,
excerpt=excerpt
)
fts_results.append(semantic_result)
except Exception as sem_exc:
self.logger.debug(f"Semantic search error in {index_path}: {sem_exc}")
return fts_results
# Use hybrid search if enabled
if hybrid_mode:
hybrid_engine = HybridSearchEngine(weights=hybrid_weights)
fts_results = hybrid_engine.search(
index_path,
query,
limit=limit,
enable_fuzzy=enable_fuzzy,
enable_vector=False, # Vector search not yet implemented
)
else:
# Legacy single-FTS search
with DirIndexStore(index_path) as store:
# Get FTS results
if files_only:
# Fast path: return paths only without snippets
paths = store.search_files_only(query, limit=limit)
fts_results = [SearchResult(path=p, score=0.0, excerpt="") for p in paths]
else:
fts_results = store.search_fts(query, limit=limit)
# Optionally add semantic keyword results
if include_semantic:
try:
semantic_matches = store.search_semantic_keywords(query)
# Convert semantic matches to SearchResult with 0.8x weight
for file_entry, keywords in semantic_matches:
# Create excerpt from keywords
excerpt = f"Keywords: {', '.join(keywords[:5])}"
# Use a base score of 10.0 for semantic matches, weighted by 0.8
semantic_result = SearchResult(
path=str(file_entry.full_path),
score=10.0 * 0.8,
excerpt=excerpt
)
fts_results.append(semantic_result)
except Exception as sem_exc:
self.logger.debug(f"Semantic search error in {index_path}: {sem_exc}")
return fts_results
except Exception as exc:
self.logger.debug(f"Search error in {index_path}: {exc}")
return []
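
A sketch of driving hybrid mode programmatically through SearchOptions, mirroring the CLI wiring above (the query and project path are placeholders):

# Sketch: hybrid search via ChainSearchEngine, mirroring the CLI flags.
from pathlib import Path

from codexlens.search.chain_search import ChainSearchEngine, SearchOptions
from codexlens.storage.path_mapper import PathMapper
from codexlens.storage.registry import RegistryStore

registry = RegistryStore()
registry.initialize()
try:
    engine = ChainSearchEngine(registry, PathMapper())
    options = SearchOptions(
        total_limit=20,
        hybrid_mode=True,
        enable_fuzzy=True,
        hybrid_weights={"exact": 0.5, "fuzzy": 0.3, "vector": 0.2},
    )
    result = engine.search("UserAuth", Path(".").resolve(), options)
    for r in result.results[:5]:
        print(f"{r.score:.4f}  {r.path}")
finally:
    registry.close()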

View File

@@ -0,0 +1,211 @@
"""Hybrid search engine orchestrating parallel exact/fuzzy/vector searches with RRF fusion.
Coordinates multiple search backends in parallel using ThreadPoolExecutor and combines
results via Reciprocal Rank Fusion (RRF) algorithm.
"""
from __future__ import annotations
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from typing import Dict, List, Optional
from codexlens.entities import SearchResult
from codexlens.search.ranking import reciprocal_rank_fusion, tag_search_source
from codexlens.storage.dir_index import DirIndexStore
class HybridSearchEngine:
"""Hybrid search engine with parallel execution and RRF fusion.
Orchestrates searches across exact FTS, fuzzy FTS, and optional vector backends,
executing them in parallel and fusing results via Reciprocal Rank Fusion.
Attributes:
logger: Python logger instance
default_weights: Default RRF weights for each source
"""
# Default RRF weights (exact: 40%, fuzzy: 30%, vector: 30%)
DEFAULT_WEIGHTS = {
"exact": 0.4,
"fuzzy": 0.3,
"vector": 0.3,
}
def __init__(self, weights: Optional[Dict[str, float]] = None):
"""Initialize hybrid search engine.
Args:
weights: Optional custom RRF weights (default: DEFAULT_WEIGHTS)
"""
self.logger = logging.getLogger(__name__)
self.weights = weights or self.DEFAULT_WEIGHTS.copy()
def search(
self,
index_path: Path,
query: str,
limit: int = 20,
enable_fuzzy: bool = True,
enable_vector: bool = False,
) -> List[SearchResult]:
"""Execute hybrid search with parallel retrieval and RRF fusion.
Args:
index_path: Path to _index.db file
query: FTS5 query string
limit: Maximum results to return after fusion
enable_fuzzy: Enable fuzzy FTS search (default True)
enable_vector: Enable vector search (default False)
Returns:
List of SearchResult objects sorted by fusion score
Examples:
>>> engine = HybridSearchEngine()
>>> results = engine.search(Path("project/_index.db"), "authentication")
>>> for r in results[:5]:
... print(f"{r.path}: {r.score:.3f}")
"""
# Determine which backends to use
backends = {"exact": True} # Always use exact search
if enable_fuzzy:
backends["fuzzy"] = True
if enable_vector:
backends["vector"] = True
# Execute parallel searches
results_map = self._search_parallel(index_path, query, backends, limit)
# Apply RRF fusion
# Filter weights to only active backends
active_weights = {
source: weight
for source, weight in self.weights.items()
if source in results_map
}
fused_results = reciprocal_rank_fusion(results_map, active_weights)
# Apply final limit
return fused_results[:limit]
def _search_parallel(
self,
index_path: Path,
query: str,
backends: Dict[str, bool],
limit: int,
) -> Dict[str, List[SearchResult]]:
"""Execute parallel searches across enabled backends.
Args:
index_path: Path to _index.db file
query: FTS5 query string
backends: Dictionary of backend name to enabled flag
limit: Results limit per backend
Returns:
Dictionary mapping source name to results list
"""
results_map: Dict[str, List[SearchResult]] = {}
# Use ThreadPoolExecutor for parallel I/O-bound searches
with ThreadPoolExecutor(max_workers=len(backends)) as executor:
# Submit search tasks
future_to_source = {}
if backends.get("exact"):
future = executor.submit(
self._search_exact, index_path, query, limit
)
future_to_source[future] = "exact"
if backends.get("fuzzy"):
future = executor.submit(
self._search_fuzzy, index_path, query, limit
)
future_to_source[future] = "fuzzy"
if backends.get("vector"):
future = executor.submit(
self._search_vector, index_path, query, limit
)
future_to_source[future] = "vector"
# Collect results as they complete
for future in as_completed(future_to_source):
source = future_to_source[future]
try:
results = future.result()
# Tag results with source for debugging
tagged_results = tag_search_source(results, source)
results_map[source] = tagged_results
self.logger.debug(
"Got %d results from %s search", len(results), source
)
except Exception as exc:
self.logger.error("Search failed for %s: %s", source, exc)
results_map[source] = []
return results_map
def _search_exact(
self, index_path: Path, query: str, limit: int
) -> List[SearchResult]:
"""Execute exact FTS search using unicode61 tokenizer.
Args:
index_path: Path to _index.db file
query: FTS5 query string
limit: Maximum results
Returns:
List of SearchResult objects
"""
try:
with DirIndexStore(index_path) as store:
return store.search_fts_exact(query, limit=limit)
except Exception as exc:
self.logger.debug("Exact search error: %s", exc)
return []
def _search_fuzzy(
self, index_path: Path, query: str, limit: int
) -> List[SearchResult]:
"""Execute fuzzy FTS search using trigram/extended unicode61 tokenizer.
Args:
index_path: Path to _index.db file
query: FTS5 query string
limit: Maximum results
Returns:
List of SearchResult objects
"""
try:
with DirIndexStore(index_path) as store:
return store.search_fts_fuzzy(query, limit=limit)
except Exception as exc:
self.logger.debug("Fuzzy search error: %s", exc)
return []
def _search_vector(
self, index_path: Path, query: str, limit: int
) -> List[SearchResult]:
"""Execute vector search (placeholder for future implementation).
Args:
index_path: Path to _index.db file
query: Query string
limit: Maximum results
Returns:
List of SearchResult objects (empty for now)
"""
# Placeholder for vector search integration
# Will be implemented when VectorStore is available
self.logger.debug("Vector search not yet implemented")
return []
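
Direct use of the engine against a single directory index, with custom weights (the index path is a placeholder):

# Sketch: fuse exact and fuzzy results for one _index.db with custom weights.
from pathlib import Path

from codexlens.search.hybrid_search import HybridSearchEngine

engine = HybridSearchEngine(weights={"exact": 0.6, "fuzzy": 0.4, "vector": 0.0})
results = engine.search(
    Path("~/.codexlens/indexes/project/_index.db").expanduser(),  # placeholder path
    "getUserData",
    limit=10,
    enable_fuzzy=True,
)
for r in results:
    print(f"{r.score:.4f}  {r.path}")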

View File

@@ -0,0 +1,242 @@
"""Query preprocessing for CodexLens search.
Provides query expansion for better identifier matching:
- CamelCase splitting: UserAuth → User OR Auth
- snake_case splitting: user_auth → user OR auth
- Preserves original query for exact matching
"""
from __future__ import annotations
import logging
import re
from typing import Set, List
log = logging.getLogger(__name__)
class QueryParser:
"""Parser for preprocessing search queries before FTS5 execution.
Expands identifier-style queries (CamelCase, snake_case) into OR queries
to improve recall when searching for code symbols.
Example transformations:
- 'UserAuth' → 'UserAuth OR User OR Auth'
- 'user_auth' → 'user_auth OR user OR auth'
- 'getUserData' → 'getUserData OR get OR User OR Data'
"""
# Patterns for identifier splitting
CAMEL_CASE_PATTERN = re.compile(r'([a-z])([A-Z])')
SNAKE_CASE_PATTERN = re.compile(r'_+')
KEBAB_CASE_PATTERN = re.compile(r'-+')
# Minimum token length to include in expansion (avoid noise from single chars)
MIN_TOKEN_LENGTH = 2
# All-caps acronyms pattern (e.g., HTTP, SQL, API)
ALL_CAPS_PATTERN = re.compile(r'^[A-Z]{2,}$')
def __init__(self, enable: bool = True, min_token_length: int = 2):
"""Initialize query parser.
Args:
enable: Whether to enable query preprocessing
min_token_length: Minimum token length to include in expansion
"""
self.enable = enable
self.min_token_length = min_token_length
def preprocess_query(self, query: str) -> str:
"""Preprocess query with identifier expansion.
Args:
query: Original search query
Returns:
Expanded query with OR operator connecting original and split tokens
Example:
>>> parser = QueryParser()
>>> parser.preprocess_query('UserAuth')
'UserAuth OR User OR Auth'
>>> parser.preprocess_query('get_user_data')
'get_user_data OR get OR user OR data'
"""
if not self.enable:
return query
query = query.strip()
if not query:
return query
# Extract tokens from query (handle multiple words/terms)
# For simple queries, just process the whole thing
# For complex FTS5 queries with operators, preserve structure
if self._is_simple_query(query):
return self._expand_simple_query(query)
else:
# Complex query with FTS5 operators, don't expand
log.debug(f"Skipping expansion for complex FTS5 query: {query}")
return query
def _is_simple_query(self, query: str) -> bool:
"""Check if query is simple (no FTS5 operators).
Args:
query: Search query
Returns:
True if query is simple (safe to expand), False otherwise
"""
# Check for FTS5 operators that indicate complex query
fts5_operators = ['OR', 'AND', 'NOT', 'NEAR', '*', '^', '"']
return not any(op in query for op in fts5_operators)
def _expand_simple_query(self, query: str) -> str:
"""Expand a simple query with identifier splitting.
Args:
query: Simple search query
Returns:
Expanded query with OR operators
"""
tokens: Set[str] = set()
# Always include original query
tokens.add(query)
# Split on whitespace first
words = query.split()
for word in words:
# Extract tokens from this word
word_tokens = self._extract_tokens(word)
tokens.update(word_tokens)
# Filter out short tokens and duplicates
filtered_tokens = [
t for t in tokens
if len(t) >= self.min_token_length
]
# Remove duplicates while preserving original query first
unique_tokens: List[str] = []
seen: Set[str] = set()
# Always put original query first
if query not in seen and len(query) >= self.min_token_length:
unique_tokens.append(query)
seen.add(query)
# Add other tokens
for token in filtered_tokens:
if token not in seen:
unique_tokens.append(token)
seen.add(token)
# Join with OR operator (only if we have multiple tokens)
if len(unique_tokens) > 1:
expanded = ' OR '.join(unique_tokens)
log.debug(f"Expanded query: '{query}''{expanded}'")
return expanded
else:
return query
def _extract_tokens(self, word: str) -> Set[str]:
"""Extract tokens from a single word using various splitting strategies.
Args:
word: Single word/identifier to split
Returns:
Set of extracted tokens
"""
tokens: Set[str] = set()
# Add original word
tokens.add(word)
# Handle all-caps acronyms (don't split)
if self.ALL_CAPS_PATTERN.match(word):
return tokens
# CamelCase splitting
camel_tokens = self._split_camel_case(word)
tokens.update(camel_tokens)
# snake_case splitting
snake_tokens = self._split_snake_case(word)
tokens.update(snake_tokens)
# kebab-case splitting
kebab_tokens = self._split_kebab_case(word)
tokens.update(kebab_tokens)
return tokens
def _split_camel_case(self, word: str) -> List[str]:
"""Split CamelCase identifier into tokens.
Args:
word: CamelCase identifier (e.g., 'getUserData')
Returns:
List of tokens (e.g., ['get', 'User', 'Data'])
"""
# Insert space before uppercase letters preceded by lowercase
spaced = self.CAMEL_CASE_PATTERN.sub(r'\1 \2', word)
# Split on spaces and filter empty
return [t for t in spaced.split() if t]
def _split_snake_case(self, word: str) -> List[str]:
"""Split snake_case identifier into tokens.
Args:
word: snake_case identifier (e.g., 'get_user_data')
Returns:
List of tokens (e.g., ['get', 'user', 'data'])
"""
# Split on underscores
return [t for t in self.SNAKE_CASE_PATTERN.split(word) if t]
def _split_kebab_case(self, word: str) -> List[str]:
"""Split kebab-case identifier into tokens.
Args:
word: kebab-case identifier (e.g., 'get-user-data')
Returns:
List of tokens (e.g., ['get', 'user', 'data'])
"""
# Split on hyphens
return [t for t in self.KEBAB_CASE_PATTERN.split(word) if t]
# Global default parser instance
_default_parser = QueryParser(enable=True)
def preprocess_query(query: str, enable: bool = True) -> str:
"""Convenience function for query preprocessing.
Args:
query: Original search query
enable: Whether to enable preprocessing
Returns:
Preprocessed query with identifier expansion
"""
if not enable:
return query
return _default_parser.preprocess_query(query)
__all__ = [
"QueryParser",
"preprocess_query",
]

View File

@@ -0,0 +1,160 @@
"""Ranking algorithms for hybrid search result fusion.
Implements Reciprocal Rank Fusion (RRF) and score normalization utilities
for combining results from heterogeneous search backends (exact FTS, fuzzy FTS, vector search).
"""
from __future__ import annotations
import math
from typing import Dict, List, Optional
from codexlens.entities import SearchResult
def reciprocal_rank_fusion(
results_map: Dict[str, List[SearchResult]],
weights: Optional[Dict[str, float]] = None,
k: int = 60,
) -> List[SearchResult]:
"""Combine search results from multiple sources using Reciprocal Rank Fusion.
RRF formula: score(d) = Σ weight_source / (k + rank_source(d))
Args:
results_map: Dictionary mapping source name to list of SearchResult objects
Sources: 'exact', 'fuzzy', 'vector'
weights: Dictionary mapping source name to weight (default: equal weights)
Example: {'exact': 0.4, 'fuzzy': 0.3, 'vector': 0.3}
k: Constant to avoid division by zero and control rank influence (default 60)
Returns:
List of SearchResult objects sorted by fused score (descending)
Examples:
>>> exact_results = [SearchResult(path="a.py", score=10.0, excerpt="...")]
>>> fuzzy_results = [SearchResult(path="b.py", score=8.0, excerpt="...")]
>>> results_map = {'exact': exact_results, 'fuzzy': fuzzy_results}
>>> fused = reciprocal_rank_fusion(results_map)
"""
if not results_map:
return []
# Default equal weights if not provided
if weights is None:
num_sources = len(results_map)
weights = {source: 1.0 / num_sources for source in results_map}
# Validate weights sum to 1.0
weight_sum = sum(weights.values())
if not math.isclose(weight_sum, 1.0, abs_tol=0.01):
# Normalize weights to sum to 1.0
weights = {source: w / weight_sum for source, w in weights.items()}
# Build unified result set with RRF scores
path_to_result: Dict[str, SearchResult] = {}
path_to_fusion_score: Dict[str, float] = {}
for source_name, results in results_map.items():
weight = weights.get(source_name, 0.0)
if weight == 0:
continue
for rank, result in enumerate(results, start=1):
path = result.path
rrf_contribution = weight / (k + rank)
# Initialize or accumulate fusion score
if path not in path_to_fusion_score:
path_to_fusion_score[path] = 0.0
path_to_result[path] = result
path_to_fusion_score[path] += rrf_contribution
# Create final results with fusion scores
fused_results = []
for path, base_result in path_to_result.items():
fusion_score = path_to_fusion_score[path]
# Create new SearchResult with fusion_score in metadata
fused_result = SearchResult(
path=base_result.path,
score=fusion_score,
excerpt=base_result.excerpt,
content=base_result.content,
symbol=base_result.symbol,
chunk=base_result.chunk,
metadata={
**base_result.metadata,
"fusion_score": fusion_score,
"original_score": base_result.score,
},
start_line=base_result.start_line,
end_line=base_result.end_line,
symbol_name=base_result.symbol_name,
symbol_kind=base_result.symbol_kind,
)
fused_results.append(fused_result)
# Sort by fusion score descending
fused_results.sort(key=lambda r: r.score, reverse=True)
return fused_results
def normalize_bm25_score(score: float) -> float:
"""Normalize BM25 scores from SQLite FTS5 to 0-1 range.
SQLite FTS5 returns negative BM25 scores (more negative = better match).
Uses sigmoid transformation for normalization.
Args:
score: Raw BM25 score from SQLite (typically negative)
Returns:
Normalized score in range [0, 1]
Examples:
>>> round(normalize_bm25_score(-10.5), 2)  # Good match
0.74
>>> round(normalize_bm25_score(-1.2), 2)  # Weak match
0.53
"""
# Take absolute value (BM25 is negative in SQLite)
abs_score = abs(score)
# Sigmoid transformation: 1 / (1 + e^(-x))
# Scale factor of 0.1 maps typical BM25 range (-20 to 0) to (0, 1)
normalized = 1.0 / (1.0 + math.exp(-abs_score * 0.1))
return normalized
def tag_search_source(results: List[SearchResult], source: str) -> List[SearchResult]:
"""Tag search results with their source for RRF tracking.
Args:
results: List of SearchResult objects
source: Source identifier ('exact', 'fuzzy', 'vector')
Returns:
List of SearchResult objects with 'search_source' in metadata
"""
tagged_results = []
for result in results:
tagged_result = SearchResult(
path=result.path,
score=result.score,
excerpt=result.excerpt,
content=result.content,
symbol=result.symbol,
chunk=result.chunk,
metadata={**result.metadata, "search_source": source},
start_line=result.start_line,
end_line=result.end_line,
symbol_name=result.symbol_name,
symbol_kind=result.symbol_kind,
)
tagged_results.append(tagged_result)
return tagged_results
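
A worked instance of the RRF formula above with k=60 (weights here are illustrative and sum to 1.0; printed scores are rounded):

# Worked RRF example: a result found by both backends outranks one
# found only by the higher-weighted backend at a better rank.
from codexlens.entities import SearchResult
from codexlens.search.ranking import reciprocal_rank_fusion

exact = [SearchResult(path="auth.py", score=12.0, excerpt=""),
         SearchResult(path="user.py", score=6.0, excerpt="")]
fuzzy = [SearchResult(path="user.py", score=4.0, excerpt="")]

fused = reciprocal_rank_fusion({"exact": exact, "fuzzy": fuzzy},
                               weights={"exact": 0.6, "fuzzy": 0.4}, k=60)
# auth.py: 0.6/(60+1)              ≈ 0.00984
# user.py: 0.6/(60+2) + 0.4/(60+1) ≈ 0.00968 + 0.00656 ≈ 0.01623
for r in fused:
    print(f"{r.path}: {r.score:.5f}")   # user.py ranks first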

View File

@@ -57,7 +57,7 @@ class DirIndexStore:
# Schema version for migration tracking
# Increment this when schema changes require migration
SCHEMA_VERSION = 2
SCHEMA_VERSION = 4
def __init__(self, db_path: str | Path) -> None:
"""Initialize directory index store.
@@ -93,11 +93,13 @@ class DirIndexStore:
)
# Create or migrate schema
self._create_schema(conn)
self._create_fts_triggers(conn)
# Apply versioned migrations if needed
if current_version < self.SCHEMA_VERSION:
if current_version == 0:
# New database - create schema directly
self._create_schema(conn)
self._create_fts_triggers(conn)
self._set_schema_version(conn, self.SCHEMA_VERSION)
elif current_version < self.SCHEMA_VERSION:
# Existing database - apply migrations
self._apply_migrations(conn, current_version)
self._set_schema_version(conn, self.SCHEMA_VERSION)
@@ -126,6 +128,11 @@ class DirIndexStore:
if from_version < 2:
self._migrate_v2_add_name_column(conn)
# Migration v2 -> v4: Add dual FTS tables (exact + fuzzy)
if from_version < 4:
from codexlens.storage.migrations.migration_004_dual_fts import upgrade
upgrade(conn)
def close(self) -> None:
"""Close database connection."""
with self._lock:
@@ -465,6 +472,117 @@ class DirIndexStore:
return float(row["mtime"]) if row and row["mtime"] else None
def needs_reindex(self, full_path: str | Path) -> bool:
"""Check if a file needs reindexing based on mtime comparison.
Uses 1ms tolerance to handle filesystem timestamp precision variations.
Args:
full_path: Complete source file path
Returns:
True if file should be reindexed (new, modified, or missing from index)
"""
full_path_obj = Path(full_path).resolve()
if not full_path_obj.exists():
return False # File doesn't exist, skip indexing
# Get current filesystem mtime
try:
current_mtime = full_path_obj.stat().st_mtime
except OSError:
return False # Can't read file stats, skip
# Get stored mtime from database
stored_mtime = self.get_file_mtime(full_path_obj)
# File not in index, needs indexing
if stored_mtime is None:
return True
# Compare with 1ms tolerance for floating point precision
MTIME_TOLERANCE = 0.001
return abs(current_mtime - stored_mtime) > MTIME_TOLERANCE
def add_file_incremental(
self,
name: str,
full_path: str | Path,
content: str,
language: str,
symbols: Optional[List[Symbol]] = None,
) -> Optional[int]:
"""Add or update a file only if it has changed (incremental indexing).
Checks mtime before indexing to skip unchanged files.
Args:
name: Filename without path
full_path: Complete source file path
content: File content for indexing
language: Programming language identifier
symbols: List of Symbol objects from the file
Returns:
Database file_id if indexed, None if skipped (unchanged)
Raises:
StorageError: If database operations fail
"""
# Check if reindexing is needed
if not self.needs_reindex(full_path):
return None # Skip unchanged file
# File changed or new, perform full indexing
return self.add_file(name, full_path, content, language, symbols)
def cleanup_deleted_files(self, source_dir: Path) -> int:
"""Remove indexed files that no longer exist in the source directory.
Scans the source directory and removes database entries for deleted files.
Args:
source_dir: Source directory to scan
Returns:
Number of deleted file entries removed
Raises:
StorageError: If cleanup operations fail
"""
with self._lock:
conn = self._get_connection()
source_dir = source_dir.resolve()
try:
# Get all indexed file paths
rows = conn.execute("SELECT full_path FROM files").fetchall()
indexed_paths = {row["full_path"] for row in rows}
# Build set of existing files in source directory
existing_paths = set()
for file_path in source_dir.rglob("*"):
if file_path.is_file():
existing_paths.add(str(file_path.resolve()))
# Find orphaned entries (indexed but no longer exist)
deleted_paths = indexed_paths - existing_paths
# Remove orphaned entries
deleted_count = 0
for deleted_path in deleted_paths:
conn.execute("DELETE FROM files WHERE full_path=?", (deleted_path,))
deleted_count += 1
if deleted_count > 0:
conn.commit()
return deleted_count
except Exception as exc:
conn.rollback()
raise StorageError(f"Failed to cleanup deleted files: {exc}") from exc
def list_files(self) -> List[FileEntry]:
"""List all files in current directory.
@@ -985,6 +1103,92 @@ class DirIndexStore:
)
return results
def search_fts_exact(self, query: str, limit: int = 20) -> List[SearchResult]:
"""Full-text search using exact token matching (unicode61 tokenizer).
Args:
query: FTS5 query string
limit: Maximum results to return
Returns:
List of SearchResult objects sorted by relevance
Raises:
StorageError: If FTS search fails
"""
with self._lock:
conn = self._get_connection()
try:
rows = conn.execute(
"""
SELECT rowid, full_path, bm25(files_fts_exact) AS rank,
snippet(files_fts_exact, 2, '[bold red]', '[/bold red]', '...', 20) AS excerpt
FROM files_fts_exact
WHERE files_fts_exact MATCH ?
ORDER BY rank
LIMIT ?
""",
(query, limit),
).fetchall()
except sqlite3.DatabaseError as exc:
raise StorageError(f"FTS exact search failed: {exc}") from exc
results: List[SearchResult] = []
for row in rows:
rank = float(row["rank"]) if row["rank"] is not None else 0.0
score = abs(rank) if rank < 0 else 0.0
results.append(
SearchResult(
path=row["full_path"],
score=score,
excerpt=row["excerpt"],
)
)
return results
def search_fts_fuzzy(self, query: str, limit: int = 20) -> List[SearchResult]:
"""Full-text search using fuzzy/substring matching (trigram or extended unicode61 tokenizer).
Args:
query: FTS5 query string
limit: Maximum results to return
Returns:
List of SearchResult objects sorted by relevance
Raises:
StorageError: If FTS search fails
"""
with self._lock:
conn = self._get_connection()
try:
rows = conn.execute(
"""
SELECT rowid, full_path, bm25(files_fts_fuzzy) AS rank,
snippet(files_fts_fuzzy, 2, '[bold red]', '[/bold red]', '...', 20) AS excerpt
FROM files_fts_fuzzy
WHERE files_fts_fuzzy MATCH ?
ORDER BY rank
LIMIT ?
""",
(query, limit),
).fetchall()
except sqlite3.DatabaseError as exc:
raise StorageError(f"FTS fuzzy search failed: {exc}") from exc
results: List[SearchResult] = []
for row in rows:
rank = float(row["rank"]) if row["rank"] is not None else 0.0
score = abs(rank) if rank < 0 else 0.0
results.append(
SearchResult(
path=row["full_path"],
score=score,
excerpt=row["excerpt"],
)
)
return results
def search_files_only(self, query: str, limit: int = 20) -> List[str]:
"""Fast FTS search returning only file paths (no snippet generation).
@@ -1185,16 +1389,34 @@ class DirIndexStore:
"""
)
# FTS5 external content table with code-friendly tokenizer
# unicode61 tokenchars keeps underscores as part of tokens
# so 'user_id' is indexed as one token, not 'user' and 'id'
# Dual FTS5 external content tables for exact and fuzzy matching
# files_fts_exact: unicode61 tokenizer for exact token matching
# files_fts_fuzzy: trigram tokenizer (or extended unicode61) for substring/fuzzy matching
from codexlens.storage.sqlite_utils import check_trigram_support
has_trigram = check_trigram_support(conn)
fuzzy_tokenizer = "trigram" if has_trigram else "unicode61 tokenchars '_-'"
# Exact FTS table with unicode61 tokenizer
conn.execute(
"""
CREATE VIRTUAL TABLE IF NOT EXISTS files_fts USING fts5(
CREATE VIRTUAL TABLE IF NOT EXISTS files_fts_exact USING fts5(
name, full_path UNINDEXED, content,
content='files',
content_rowid='id',
tokenize="unicode61 tokenchars '_'"
tokenize="unicode61 tokenchars '_-'"
)
"""
)
# Fuzzy FTS table with trigram or extended unicode61 tokenizer
conn.execute(
f"""
CREATE VIRTUAL TABLE IF NOT EXISTS files_fts_fuzzy USING fts5(
name, full_path UNINDEXED, content,
content='files',
content_rowid='id',
tokenize="{fuzzy_tokenizer}"
)
"""
)
@@ -1301,38 +1523,72 @@ class DirIndexStore:
conn.execute("UPDATE files SET name = ? WHERE id = ?", (name, file_id))
def _create_fts_triggers(self, conn: sqlite3.Connection) -> None:
"""Create FTS5 external content triggers.
"""Create FTS5 external content triggers for dual FTS tables.
Creates synchronized triggers for both files_fts_exact and files_fts_fuzzy tables.
Args:
conn: Database connection
"""
# Insert trigger
# Insert triggers for files_fts_exact
conn.execute(
"""
CREATE TRIGGER IF NOT EXISTS files_ai AFTER INSERT ON files BEGIN
INSERT INTO files_fts(rowid, name, full_path, content)
CREATE TRIGGER IF NOT EXISTS files_exact_ai AFTER INSERT ON files BEGIN
INSERT INTO files_fts_exact(rowid, name, full_path, content)
VALUES(new.id, new.name, new.full_path, new.content);
END
"""
)
# Delete trigger
# Delete trigger for files_fts_exact
conn.execute(
"""
CREATE TRIGGER IF NOT EXISTS files_ad AFTER DELETE ON files BEGIN
INSERT INTO files_fts(files_fts, rowid, name, full_path, content)
CREATE TRIGGER IF NOT EXISTS files_exact_ad AFTER DELETE ON files BEGIN
INSERT INTO files_fts_exact(files_fts_exact, rowid, name, full_path, content)
VALUES('delete', old.id, old.name, old.full_path, old.content);
END
"""
)
# Update trigger
# Update trigger for files_fts_exact
conn.execute(
"""
CREATE TRIGGER IF NOT EXISTS files_au AFTER UPDATE ON files BEGIN
INSERT INTO files_fts(files_fts, rowid, name, full_path, content)
CREATE TRIGGER IF NOT EXISTS files_exact_au AFTER UPDATE ON files BEGIN
INSERT INTO files_fts_exact(files_fts_exact, rowid, name, full_path, content)
VALUES('delete', old.id, old.name, old.full_path, old.content);
INSERT INTO files_fts(rowid, name, full_path, content)
INSERT INTO files_fts_exact(rowid, name, full_path, content)
VALUES(new.id, new.name, new.full_path, new.content);
END
"""
)
# Insert trigger for files_fts_fuzzy
conn.execute(
"""
CREATE TRIGGER IF NOT EXISTS files_fuzzy_ai AFTER INSERT ON files BEGIN
INSERT INTO files_fts_fuzzy(rowid, name, full_path, content)
VALUES(new.id, new.name, new.full_path, new.content);
END
"""
)
# Delete trigger for files_fts_fuzzy
conn.execute(
"""
CREATE TRIGGER IF NOT EXISTS files_fuzzy_ad AFTER DELETE ON files BEGIN
INSERT INTO files_fts_fuzzy(files_fts_fuzzy, rowid, name, full_path, content)
VALUES('delete', old.id, old.name, old.full_path, old.content);
END
"""
)
# Update trigger for files_fts_fuzzy
conn.execute(
"""
CREATE TRIGGER IF NOT EXISTS files_fuzzy_au AFTER UPDATE ON files BEGIN
INSERT INTO files_fts_fuzzy(files_fts_fuzzy, rowid, name, full_path, content)
VALUES('delete', old.id, old.name, old.full_path, old.content);
INSERT INTO files_fts_fuzzy(rowid, name, full_path, content)
VALUES(new.id, new.name, new.full_path, new.content);
END
"""

View File

@@ -77,7 +77,7 @@ class IndexTreeBuilder:
}
def __init__(
self, registry: RegistryStore, mapper: PathMapper, config: Config = None
self, registry: RegistryStore, mapper: PathMapper, config: Config = None, incremental: bool = True
):
"""Initialize the index tree builder.
@@ -85,18 +85,21 @@ class IndexTreeBuilder:
registry: Global registry store for project tracking
mapper: Path mapper for source to index conversions
config: CodexLens configuration (uses defaults if None)
incremental: Enable incremental indexing (default True)
"""
self.registry = registry
self.mapper = mapper
self.config = config or Config()
self.parser_factory = ParserFactory(self.config)
self.logger = logging.getLogger(__name__)
self.incremental = incremental
def build(
self,
source_root: Path,
languages: List[str] = None,
workers: int = 4,
force_full: bool = False,
) -> BuildResult:
"""Build complete index tree for a project.
@@ -106,11 +109,13 @@ class IndexTreeBuilder:
3. Build indexes bottom-up (deepest first)
4. Link subdirectories to parents
5. Update project statistics
6. Cleanup deleted files (if incremental mode)
Args:
source_root: Project root directory to index
languages: Optional list of language IDs to limit indexing
workers: Number of parallel worker processes
force_full: Force full reindex (override incremental mode)
Returns:
BuildResult with statistics and errors
@@ -122,7 +127,12 @@ class IndexTreeBuilder:
if not source_root.exists():
raise ValueError(f"Source root does not exist: {source_root}")
self.logger.info("Building index tree for %s", source_root)
# Override incremental mode if force_full is True
use_incremental = self.incremental and not force_full
if force_full:
self.logger.info("Building index tree for %s (FULL reindex)", source_root)
else:
self.logger.info("Building index tree for %s (incremental=%s)", source_root, use_incremental)
# Register project
index_root = self.mapper.source_to_index_dir(source_root)
@@ -186,6 +196,25 @@ class IndexTreeBuilder:
# Link children to this directory
self._link_children_to_parent(result.source_path, all_results)
# Cleanup deleted files if in incremental mode
if use_incremental:
self.logger.info("Cleaning up deleted files...")
total_deleted = 0
for result in all_results:
if result.error:
continue
try:
with DirIndexStore(result.index_path) as store:
deleted_count = store.cleanup_deleted_files(result.source_path)
total_deleted += deleted_count
if deleted_count > 0:
self.logger.debug("Removed %d deleted files from %s", deleted_count, result.source_path)
except Exception as exc:
self.logger.warning("Cleanup failed for %s: %s", result.source_path, exc)
if total_deleted > 0:
self.logger.info("Removed %d deleted files from index", total_deleted)
# Update project statistics
self.registry.update_project_stats(source_root, total_files, total_dirs)
@@ -436,9 +465,15 @@ class IndexTreeBuilder:
files_count = 0
symbols_count = 0
skipped_count = 0
for file_path in source_files:
try:
# Check if file needs reindexing (incremental mode)
if self.incremental and not store.needs_reindex(file_path):
skipped_count += 1
continue
# Read and parse file
text = file_path.read_text(encoding="utf-8", errors="ignore")
language_id = self.config.language_for_path(file_path)
@@ -491,13 +526,23 @@ class IndexTreeBuilder:
store.close()
self.logger.debug(
"Built %s: %d files, %d symbols, %d subdirs",
dir_path,
files_count,
symbols_count,
len(subdirs),
)
if skipped_count > 0:
self.logger.debug(
"Built %s: %d files indexed, %d skipped (unchanged), %d symbols, %d subdirs",
dir_path,
files_count,
skipped_count,
symbols_count,
len(subdirs),
)
else:
self.logger.debug(
"Built %s: %d files, %d symbols, %d subdirs",
dir_path,
files_count,
symbols_count,
len(subdirs),
)
return DirBuildResult(
source_path=dir_path,
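
How a caller might drive the builder directly, mirroring the init command's incremental and --force behaviour (the project path is a placeholder):

# Sketch: incremental vs forced full build through IndexTreeBuilder.
from pathlib import Path

from codexlens.storage.index_tree import IndexTreeBuilder
from codexlens.storage.path_mapper import PathMapper
from codexlens.storage.registry import RegistryStore

registry = RegistryStore()
registry.initialize()
builder = IndexTreeBuilder(registry, PathMapper(), incremental=True)

project = Path("~/projects/demo").expanduser()  # placeholder path
builder.build(source_root=project, workers=4)                   # skips unchanged files
builder.build(source_root=project, workers=4, force_full=True)  # rebuilds everything
registry.close()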

View File

@@ -0,0 +1,231 @@
"""
Migration 004: Add dual FTS tables for exact and fuzzy matching.
This migration introduces two FTS5 tables:
- files_fts_exact: Uses unicode61 tokenizer for exact token matching
- files_fts_fuzzy: Uses trigram tokenizer (or extended unicode61) for substring/fuzzy matching
Both tables are synchronized with the files table via triggers for automatic updates.
"""
import logging
from sqlite3 import Connection
from codexlens.storage.sqlite_utils import check_trigram_support, get_sqlite_version
log = logging.getLogger(__name__)
def upgrade(db_conn: Connection):
"""
Applies the migration to add dual FTS tables.
- Drops old files_fts table and triggers
- Creates files_fts_exact with unicode61 tokenizer
- Creates files_fts_fuzzy with trigram or extended unicode61 tokenizer
- Creates synchronized triggers for both tables
- Rebuilds FTS indexes from files table
Args:
db_conn: The SQLite database connection.
"""
cursor = db_conn.cursor()
try:
# Check trigram support
has_trigram = check_trigram_support(db_conn)
version = get_sqlite_version(db_conn)
log.info(f"SQLite version: {'.'.join(map(str, version))}")
if has_trigram:
log.info("Trigram tokenizer available, using for fuzzy FTS table")
fuzzy_tokenizer = "trigram"
else:
log.warning(
f"Trigram tokenizer not available (requires SQLite >= 3.34), "
f"using extended unicode61 tokenizer for fuzzy matching"
)
fuzzy_tokenizer = "unicode61 tokenchars '_-'"
# Start transaction
cursor.execute("BEGIN TRANSACTION")
# Check if files table has 'name' column (v2 schema doesn't have it)
cursor.execute("PRAGMA table_info(files)")
columns = {row[1] for row in cursor.fetchall()}
if 'name' not in columns:
log.info("Adding 'name' column to files table (v2 schema upgrade)...")
# Add name column
cursor.execute("ALTER TABLE files ADD COLUMN name TEXT")
# Populate name from path (extract filename from last '/')
# Use Python to do the extraction since SQLite doesn't have reverse()
cursor.execute("SELECT rowid, path FROM files")
rows = cursor.fetchall()
for rowid, path in rows:
# Extract filename from path
name = path.split('/')[-1] if '/' in path else path
cursor.execute("UPDATE files SET name = ? WHERE rowid = ?", (name, rowid))
# Rename 'path' column to 'full_path' if needed
if 'path' in columns and 'full_path' not in columns:
log.info("Renaming 'path' to 'full_path' (v2 schema upgrade)...")
# Check if indexed_at column exists in v2 schema
has_indexed_at = 'indexed_at' in columns
has_mtime = 'mtime' in columns
# SQLite doesn't support RENAME COLUMN before 3.25, so use table recreation
cursor.execute("""
CREATE TABLE files_new (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name TEXT NOT NULL,
full_path TEXT NOT NULL UNIQUE,
content TEXT,
language TEXT,
mtime REAL,
indexed_at TEXT
)
""")
# Build INSERT statement based on available columns
# Note: v2 schema has no rowid (path is PRIMARY KEY), so use NULL for AUTOINCREMENT
if has_indexed_at and has_mtime:
cursor.execute("""
INSERT INTO files_new (name, full_path, content, language, mtime, indexed_at)
SELECT name, path, content, language, mtime, indexed_at FROM files
""")
elif has_indexed_at:
cursor.execute("""
INSERT INTO files_new (name, full_path, content, language, indexed_at)
SELECT name, path, content, language, indexed_at FROM files
""")
elif has_mtime:
cursor.execute("""
INSERT INTO files_new (name, full_path, content, language, mtime)
SELECT name, path, content, language, mtime FROM files
""")
else:
cursor.execute("""
INSERT INTO files_new (name, full_path, content, language)
SELECT name, path, content, language FROM files
""")
cursor.execute("DROP TABLE files")
cursor.execute("ALTER TABLE files_new RENAME TO files")
log.info("Dropping old FTS triggers and table...")
# Drop old triggers
cursor.execute("DROP TRIGGER IF EXISTS files_ai")
cursor.execute("DROP TRIGGER IF EXISTS files_ad")
cursor.execute("DROP TRIGGER IF EXISTS files_au")
# Drop old FTS table
cursor.execute("DROP TABLE IF EXISTS files_fts")
# Create exact FTS table (unicode61 with underscores/hyphens as token chars)
log.info("Creating files_fts_exact table with unicode61 tokenizer...")
cursor.execute(
"""
CREATE VIRTUAL TABLE files_fts_exact USING fts5(
name, full_path UNINDEXED, content,
content='files',
content_rowid='id',
tokenize="unicode61 tokenchars '_-'"
)
"""
)
# Create fuzzy FTS table (trigram or extended unicode61)
log.info(f"Creating files_fts_fuzzy table with {fuzzy_tokenizer} tokenizer...")
cursor.execute(
f"""
CREATE VIRTUAL TABLE files_fts_fuzzy USING fts5(
name, full_path UNINDEXED, content,
content='files',
content_rowid='id',
tokenize="{fuzzy_tokenizer}"
)
"""
)
# Create synchronized triggers for files_fts_exact
log.info("Creating triggers for files_fts_exact...")
cursor.execute(
"""
CREATE TRIGGER files_exact_ai AFTER INSERT ON files BEGIN
INSERT INTO files_fts_exact(rowid, name, full_path, content)
VALUES(new.id, new.name, new.full_path, new.content);
END
"""
)
cursor.execute(
"""
CREATE TRIGGER files_exact_ad AFTER DELETE ON files BEGIN
INSERT INTO files_fts_exact(files_fts_exact, rowid, name, full_path, content)
VALUES('delete', old.id, old.name, old.full_path, old.content);
END
"""
)
cursor.execute(
"""
CREATE TRIGGER files_exact_au AFTER UPDATE ON files BEGIN
INSERT INTO files_fts_exact(files_fts_exact, rowid, name, full_path, content)
VALUES('delete', old.id, old.name, old.full_path, old.content);
INSERT INTO files_fts_exact(rowid, name, full_path, content)
VALUES(new.id, new.name, new.full_path, new.content);
END
"""
)
# Create synchronized triggers for files_fts_fuzzy
log.info("Creating triggers for files_fts_fuzzy...")
cursor.execute(
"""
CREATE TRIGGER files_fuzzy_ai AFTER INSERT ON files BEGIN
INSERT INTO files_fts_fuzzy(rowid, name, full_path, content)
VALUES(new.id, new.name, new.full_path, new.content);
END
"""
)
cursor.execute(
"""
CREATE TRIGGER files_fuzzy_ad AFTER DELETE ON files BEGIN
INSERT INTO files_fts_fuzzy(files_fts_fuzzy, rowid, name, full_path, content)
VALUES('delete', old.id, old.name, old.full_path, old.content);
END
"""
)
cursor.execute(
"""
CREATE TRIGGER files_fuzzy_au AFTER UPDATE ON files BEGIN
INSERT INTO files_fts_fuzzy(files_fts_fuzzy, rowid, name, full_path, content)
VALUES('delete', old.id, old.name, old.full_path, old.content);
INSERT INTO files_fts_fuzzy(rowid, name, full_path, content)
VALUES(new.id, new.name, new.full_path, new.content);
END
"""
)
# Rebuild FTS indexes from files table
log.info("Rebuilding FTS indexes from files table...")
cursor.execute("INSERT INTO files_fts_exact(files_fts_exact) VALUES('rebuild')")
cursor.execute("INSERT INTO files_fts_fuzzy(files_fts_fuzzy) VALUES('rebuild')")
# Commit transaction
cursor.execute("COMMIT")
log.info("Migration 004 completed successfully")
# Vacuum to reclaim space (outside transaction)
try:
log.info("Running VACUUM to reclaim space...")
cursor.execute("VACUUM")
except Exception as e:
log.warning(f"VACUUM failed (non-critical): {e}")
except Exception as e:
log.error(f"Migration 004 failed: {e}")
try:
cursor.execute("ROLLBACK")
except Exception:
pass
raise

View File

@@ -0,0 +1,64 @@
"""SQLite utility functions for CodexLens storage layer."""
from __future__ import annotations
import logging
import sqlite3
log = logging.getLogger(__name__)
def check_trigram_support(conn: sqlite3.Connection) -> bool:
"""Check if SQLite supports trigram tokenizer for FTS5.
Trigram tokenizer requires SQLite >= 3.34.0.
Args:
conn: Database connection to test
Returns:
True if trigram tokenizer is available, False otherwise
"""
try:
# Test by creating a temporary virtual table with trigram tokenizer
conn.execute(
"""
CREATE VIRTUAL TABLE IF NOT EXISTS test_trigram_check
USING fts5(test_content, tokenize='trigram')
"""
)
# Clean up test table
conn.execute("DROP TABLE IF EXISTS test_trigram_check")
conn.commit()
return True
except sqlite3.OperationalError as e:
# Trigram tokenizer not available
if "unrecognized tokenizer" in str(e).lower():
log.debug("Trigram tokenizer not available in this SQLite version")
return False
# Other operational errors should be re-raised
raise
except Exception:
# Any other exception means trigram is not supported
return False
def get_sqlite_version(conn: sqlite3.Connection) -> tuple[int, int, int]:
"""Get SQLite version as (major, minor, patch) tuple.
Args:
conn: Database connection
Returns:
Version tuple, e.g., (3, 34, 1)
"""
row = conn.execute("SELECT sqlite_version()").fetchone()
version_str = row[0] if row else "0.0.0"
parts = version_str.split('.')
try:
major = int(parts[0]) if len(parts) > 0 else 0
minor = int(parts[1]) if len(parts) > 1 else 0
patch = int(parts[2]) if len(parts) > 2 else 0
return (major, minor, patch)
except (ValueError, IndexError):
return (0, 0, 0)
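
A quick sketch of how these helpers pick the fuzzy tokenizer at schema-creation time, mirroring the logic in DirIndexStore._create_schema and migration 004:

# Sketch: choose the fuzzy FTS tokenizer based on the running SQLite build.
import sqlite3

from codexlens.storage.sqlite_utils import check_trigram_support, get_sqlite_version

conn = sqlite3.connect(":memory:")
major, minor, patch = get_sqlite_version(conn)
fuzzy_tokenizer = "trigram" if check_trigram_support(conn) else "unicode61 tokenchars '_-'"
print(f"SQLite {major}.{minor}.{patch} -> fuzzy tokenizer: {fuzzy_tokenizer}")
conn.close()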