Mirror of https://github.com/catlog22/Claude-Code-Workflow.git (synced 2026-02-09 02:24:11 +08:00)
Add comprehensive tests for query parsing and Reciprocal Rank Fusion
- Implemented tests for the QueryParser class, covering the identifier splitting methods (CamelCase, snake_case, kebab-case), OR expansion, and FTS5 operator preservation.
- Added parameterized tests to validate expected token outputs for different query formats.
- Created edge-case tests to ensure robustness against unusual input.
- Developed tests for the Reciprocal Rank Fusion (RRF) algorithm, including score computation, weight handling, and result ranking across multiple sources.
- Included tests for normalization of BM25 scores and tagging of search results with source metadata.
@@ -20,6 +20,7 @@ from codexlens.parsers.factory import ParserFactory
|
||||
from codexlens.storage.path_mapper import PathMapper
|
||||
from codexlens.storage.registry import RegistryStore, ProjectInfo
|
||||
from codexlens.storage.index_tree import IndexTreeBuilder
|
||||
from codexlens.storage.dir_index import DirIndexStore
|
||||
from codexlens.search.chain_search import ChainSearchEngine, SearchOptions
|
||||
|
||||
from .output import (
|
||||
@@ -77,6 +78,7 @@ def init(
|
||||
help="Limit indexing to specific languages (repeat or comma-separated).",
|
||||
),
|
||||
workers: int = typer.Option(4, "--workers", "-w", min=1, max=16, help="Parallel worker processes."),
|
||||
force: bool = typer.Option(False, "--force", "-f", help="Force full reindex (skip incremental mode)."),
|
||||
json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
|
||||
verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."),
|
||||
) -> None:
|
||||
@@ -84,6 +86,9 @@ def init(
|
||||
|
||||
Indexes are stored in ~/.codexlens/indexes/ with mirrored directory structure.
|
||||
Set CODEXLENS_INDEX_DIR to customize the index location.
|
||||
|
||||
By default, uses incremental indexing (skip unchanged files).
|
||||
Use --force to rebuild all files regardless of modification time.
|
||||
"""
|
||||
_configure_logging(verbose)
|
||||
config = Config()
|
||||
@@ -96,14 +101,18 @@ def init(
|
||||
registry.initialize()
|
||||
mapper = PathMapper()
|
||||
|
||||
builder = IndexTreeBuilder(registry, mapper, config)
|
||||
builder = IndexTreeBuilder(registry, mapper, config, incremental=not force)
|
||||
|
||||
console.print(f"[bold]Building index for:[/bold] {base_path}")
|
||||
if force:
|
||||
console.print(f"[bold]Building index for:[/bold] {base_path} [yellow](FULL reindex)[/yellow]")
|
||||
else:
|
||||
console.print(f"[bold]Building index for:[/bold] {base_path} [dim](incremental)[/dim]")
|
||||
|
||||
build_result = builder.build(
|
||||
source_root=base_path,
|
||||
languages=languages,
|
||||
workers=workers,
|
||||
force_full=force,
|
||||
)
|
||||
|
||||
result = {
|
||||
@@ -172,6 +181,8 @@ def search(
|
||||
limit: int = typer.Option(20, "--limit", "-n", min=1, max=500, help="Max results."),
|
||||
depth: int = typer.Option(-1, "--depth", "-d", help="Search depth (-1 = unlimited, 0 = current only)."),
|
||||
files_only: bool = typer.Option(False, "--files-only", "-f", help="Return only file paths without content snippets."),
|
||||
mode: str = typer.Option("exact", "--mode", "-m", help="Search mode: exact, fuzzy, hybrid, vector."),
|
||||
weights: Optional[str] = typer.Option(None, "--weights", help="Custom RRF weights as 'exact,fuzzy,vector' (e.g., '0.5,0.3,0.2')."),
|
||||
json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
|
||||
verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."),
|
||||
) -> None:
|
||||
@@ -179,10 +190,51 @@ def search(
|
||||
|
||||
Uses chain search across directory indexes.
|
||||
Use --depth to limit search recursion (0 = current dir only).
|
||||
|
||||
Search Modes:
|
||||
- exact: Exact FTS using unicode61 tokenizer (default)
|
||||
- fuzzy: Fuzzy FTS using trigram tokenizer
|
||||
- hybrid: RRF fusion of exact + fuzzy (recommended)
|
||||
- vector: Semantic vector search (future)
|
||||
|
||||
Hybrid Mode:
|
||||
Default weights: exact=0.4, fuzzy=0.3, vector=0.3
|
||||
Use --weights to customize (e.g., --weights 0.5,0.3,0.2)
|
||||
"""
|
||||
_configure_logging(verbose)
|
||||
search_path = path.expanduser().resolve()
|
||||
|
||||
# Validate mode
|
||||
valid_modes = ["exact", "fuzzy", "hybrid", "vector"]
|
||||
if mode not in valid_modes:
|
||||
if json_mode:
|
||||
print_json(success=False, error=f"Invalid mode: {mode}. Must be one of: {', '.join(valid_modes)}")
|
||||
else:
|
||||
console.print(f"[red]Invalid mode:[/red] {mode}")
|
||||
console.print(f"[dim]Valid modes: {', '.join(valid_modes)}[/dim]")
|
||||
raise typer.Exit(code=1)
|
||||
|
||||
# Parse custom weights if provided
|
||||
hybrid_weights = None
|
||||
if weights:
|
||||
try:
|
||||
weight_parts = [float(w.strip()) for w in weights.split(",")]
|
||||
if len(weight_parts) == 3:
|
||||
weight_sum = sum(weight_parts)
|
||||
if abs(weight_sum - 1.0) > 0.01:
|
||||
console.print(f"[yellow]Warning: Weights sum to {weight_sum:.2f}, should sum to 1.0. Normalizing...[/yellow]")
|
||||
# Normalize weights
|
||||
weight_parts = [w / weight_sum for w in weight_parts]
|
||||
hybrid_weights = {
|
||||
"exact": weight_parts[0],
|
||||
"fuzzy": weight_parts[1],
|
||||
"vector": weight_parts[2],
|
||||
}
|
||||
else:
|
||||
console.print("[yellow]Warning: Invalid weights format (need 3 values). Using defaults.[/yellow]")
|
||||
except ValueError:
|
||||
console.print("[yellow]Warning: Invalid weights format. Using defaults.[/yellow]")
|
||||
|
||||
registry: RegistryStore | None = None
|
||||
try:
|
||||
registry = RegistryStore()
|
||||
@@ -190,10 +242,18 @@ def search(
|
||||
mapper = PathMapper()
|
||||
|
||||
engine = ChainSearchEngine(registry, mapper)
|
||||
|
||||
# Map mode to options
|
||||
hybrid_mode = mode == "hybrid"
|
||||
enable_fuzzy = mode in ["fuzzy", "hybrid"]
|
||||
|
||||
options = SearchOptions(
|
||||
depth=depth,
|
||||
total_limit=limit,
|
||||
files_only=files_only,
|
||||
hybrid_mode=hybrid_mode,
|
||||
enable_fuzzy=enable_fuzzy,
|
||||
hybrid_weights=hybrid_weights,
|
||||
)
|
||||
|
||||
if files_only:
|
||||
@@ -208,8 +268,17 @@ def search(
|
||||
result = engine.search(query, search_path, options)
|
||||
payload = {
|
||||
"query": query,
|
||||
"mode": mode,
|
||||
"count": len(result.results),
|
||||
"results": [{"path": r.path, "score": r.score, "excerpt": r.excerpt} for r in result.results],
|
||||
"results": [
|
||||
{
|
||||
"path": r.path,
|
||||
"score": r.score,
|
||||
"excerpt": r.excerpt,
|
||||
"source": getattr(r, "search_source", None),
|
||||
}
|
||||
for r in result.results
|
||||
],
|
||||
"stats": {
|
||||
"dirs_searched": result.stats.dirs_searched,
|
||||
"files_matched": result.stats.files_matched,
|
||||
@@ -219,9 +288,8 @@ def search(
|
||||
if json_mode:
|
||||
print_json(success=True, result=payload)
|
||||
else:
|
||||
render_search_results(result.results)
|
||||
if verbose:
|
||||
console.print(f"[dim]Searched {result.stats.dirs_searched} directories in {result.stats.time_ms:.1f}ms[/dim]")
|
||||
render_search_results(result.results, verbose=verbose)
|
||||
console.print(f"[dim]Mode: {mode} | Searched {result.stats.dirs_searched} directories in {result.stats.time_ms:.1f}ms[/dim]")
|
||||
|
||||
except SearchError as exc:
|
||||
if json_mode:
|
||||
@@ -404,6 +472,27 @@ def status(
|
||||
if f.is_file():
|
||||
index_size += f.stat().st_size
|
||||
|
||||
# Check schema version and enabled features
|
||||
schema_version = None
|
||||
has_dual_fts = False
|
||||
if projects and index_root.exists():
|
||||
# Check first index database for features
|
||||
index_files = list(index_root.rglob("_index.db"))
|
||||
if index_files:
|
||||
try:
|
||||
with DirIndexStore(index_files[0]) as store:
|
||||
with store._lock:
|
||||
conn = store._get_connection()
|
||||
schema_version = store._get_schema_version(conn)
|
||||
# Check if dual FTS tables exist
|
||||
cursor = conn.execute(
|
||||
"SELECT name FROM sqlite_master WHERE type='table' AND name IN ('search_fts_exact', 'search_fts_fuzzy')"
|
||||
)
|
||||
fts_tables = [row[0] for row in cursor.fetchall()]
|
||||
has_dual_fts = len(fts_tables) == 2
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
stats = {
|
||||
"index_root": str(index_root),
|
||||
"registry_path": str(_get_registry_path()),
|
||||
@@ -412,6 +501,13 @@ def status(
|
||||
"total_dirs": total_dirs,
|
||||
"index_size_bytes": index_size,
|
||||
"index_size_mb": round(index_size / (1024 * 1024), 2),
|
||||
"schema_version": schema_version,
|
||||
"features": {
|
||||
"exact_fts": True, # Always available
|
||||
"fuzzy_fts": has_dual_fts,
|
||||
"hybrid_search": has_dual_fts,
|
||||
"vector_search": False, # Not yet implemented
|
||||
},
|
||||
}
|
||||
|
||||
if json_mode:
|
||||
@@ -424,6 +520,17 @@ def status(
|
||||
console.print(f" Total Files: {stats['total_files']}")
|
||||
console.print(f" Total Directories: {stats['total_dirs']}")
|
||||
console.print(f" Index Size: {stats['index_size_mb']} MB")
|
||||
if schema_version:
|
||||
console.print(f" Schema Version: {schema_version}")
|
||||
console.print("\n[bold]Search Backends:[/bold]")
|
||||
console.print(f" Exact FTS: ✓ (unicode61)")
|
||||
if has_dual_fts:
|
||||
console.print(f" Fuzzy FTS: ✓ (trigram)")
|
||||
console.print(f" Hybrid Search: ✓ (RRF fusion)")
|
||||
else:
|
||||
console.print(f" Fuzzy FTS: ✗ (run 'migrate' to enable)")
|
||||
console.print(f" Hybrid Search: ✗ (run 'migrate' to enable)")
|
||||
console.print(f" Vector Search: ✗ (future)")
|
||||
|
||||
except StorageError as exc:
|
||||
if json_mode:
|
||||
@@ -778,6 +885,139 @@ def config(
|
||||
raise typer.Exit(code=1)
|
||||
|
||||
|
||||
@app.command()
|
||||
def migrate(
|
||||
path: Path = typer.Argument(Path("."), exists=True, file_okay=False, dir_okay=True, help="Project root to migrate."),
|
||||
json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
|
||||
verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."),
|
||||
) -> None:
|
||||
"""Migrate project indexes to latest schema (Dual-FTS upgrade).
|
||||
|
||||
Upgrades all _index.db files in the project to schema version 4, which includes:
|
||||
- Dual FTS tables (exact + fuzzy)
|
||||
- Encoding detection support
|
||||
- Incremental indexing metadata
|
||||
|
||||
This is a safe operation that preserves all existing data.
|
||||
Progress is shown during migration.
|
||||
"""
|
||||
_configure_logging(verbose)
|
||||
base_path = path.expanduser().resolve()
|
||||
|
||||
registry: RegistryStore | None = None
|
||||
try:
|
||||
registry = RegistryStore()
|
||||
registry.initialize()
|
||||
mapper = PathMapper()
|
||||
|
||||
# Find project
|
||||
project_info = registry.get_project(base_path)
|
||||
if not project_info:
|
||||
raise CodexLensError(f"No index found for: {base_path}. Run 'codex-lens init' first.")
|
||||
|
||||
index_dir = mapper.source_to_index_dir(base_path)
|
||||
if not index_dir.exists():
|
||||
raise CodexLensError(f"Index directory not found: {index_dir}")
|
||||
|
||||
# Find all _index.db files
|
||||
index_files = list(index_dir.rglob("_index.db"))
|
||||
|
||||
if not index_files:
|
||||
if json_mode:
|
||||
print_json(success=True, result={"message": "No indexes to migrate", "migrated": 0})
|
||||
else:
|
||||
console.print("[yellow]No indexes found to migrate.[/yellow]")
|
||||
return
|
||||
|
||||
migrated_count = 0
|
||||
error_count = 0
|
||||
already_migrated = 0
|
||||
|
||||
with Progress(
|
||||
SpinnerColumn(),
|
||||
TextColumn("[progress.description]{task.description}"),
|
||||
BarColumn(),
|
||||
TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
|
||||
TextColumn("({task.completed}/{task.total})"),
|
||||
TimeElapsedColumn(),
|
||||
console=console,
|
||||
) as progress:
|
||||
task = progress.add_task(f"Migrating {len(index_files)} indexes...", total=len(index_files))
|
||||
|
||||
for db_path in index_files:
|
||||
try:
|
||||
store = DirIndexStore(db_path)
|
||||
|
||||
# Check current version
|
||||
with store._lock:
|
||||
conn = store._get_connection()
|
||||
current_version = store._get_schema_version(conn)
|
||||
|
||||
if current_version >= DirIndexStore.SCHEMA_VERSION:
|
||||
already_migrated += 1
|
||||
if verbose:
|
||||
progress.console.print(f"[dim]Already migrated: {db_path.parent.name}[/dim]")
|
||||
elif current_version > 0:
|
||||
# Apply migrations
|
||||
store._apply_migrations(conn, current_version)
|
||||
store._set_schema_version(conn, DirIndexStore.SCHEMA_VERSION)
|
||||
conn.commit()
|
||||
migrated_count += 1
|
||||
if verbose:
|
||||
progress.console.print(f"[green]Migrated: {db_path.parent.name} (v{current_version} → v{DirIndexStore.SCHEMA_VERSION})[/green]")
|
||||
else:
|
||||
# New database, initialize directly
|
||||
store.initialize()
|
||||
migrated_count += 1
|
||||
|
||||
store.close()
|
||||
|
||||
except Exception as e:
|
||||
error_count += 1
|
||||
if verbose:
|
||||
progress.console.print(f"[red]Error migrating {db_path}: {e}[/red]")
|
||||
|
||||
progress.update(task, advance=1)
|
||||
|
||||
result = {
|
||||
"path": str(base_path),
|
||||
"total_indexes": len(index_files),
|
||||
"migrated": migrated_count,
|
||||
"already_migrated": already_migrated,
|
||||
"errors": error_count,
|
||||
}
|
||||
|
||||
if json_mode:
|
||||
print_json(success=True, result=result)
|
||||
else:
|
||||
console.print(f"[green]Migration complete:[/green]")
|
||||
console.print(f" Total indexes: {len(index_files)}")
|
||||
console.print(f" Migrated: {migrated_count}")
|
||||
console.print(f" Already up-to-date: {already_migrated}")
|
||||
if error_count > 0:
|
||||
console.print(f" [yellow]Errors: {error_count}[/yellow]")
|
||||
|
||||
except StorageError as exc:
|
||||
if json_mode:
|
||||
print_json(success=False, error=f"Storage error: {exc}")
|
||||
else:
|
||||
console.print(f"[red]Migration failed (storage):[/red] {exc}")
|
||||
raise typer.Exit(code=1)
|
||||
except CodexLensError as exc:
|
||||
if json_mode:
|
||||
print_json(success=False, error=str(exc))
|
||||
else:
|
||||
console.print(f"[red]Migration failed:[/red] {exc}")
|
||||
raise typer.Exit(code=1)
|
||||
except Exception as exc:
|
||||
if json_mode:
|
||||
print_json(success=False, error=f"Unexpected error: {exc}")
|
||||
else:
|
||||
console.print(f"[red]Migration failed (unexpected):[/red] {exc}")
|
||||
raise typer.Exit(code=1)
|
||||
finally:
|
||||
if registry is not None:
|
||||
registry.close()
|
||||
|
||||
|
||||
@app.command()
|
||||
|
||||
@@ -41,15 +41,45 @@ def print_json(*, success: bool, result: Any = None, error: str | None = None) -
|
||||
console.print_json(json.dumps(payload, ensure_ascii=False))
|
||||
|
||||
|
||||
def render_search_results(results: Sequence[SearchResult], *, title: str = "Search Results") -> None:
|
||||
def render_search_results(
|
||||
results: Sequence[SearchResult], *, title: str = "Search Results", verbose: bool = False
|
||||
) -> None:
|
||||
"""Render search results with optional source tags in verbose mode.
|
||||
|
||||
Args:
|
||||
results: Search results to display
|
||||
title: Table title
|
||||
verbose: If True, show search source tags ([E], [F], [V]) and fusion scores
|
||||
"""
|
||||
table = Table(title=title, show_lines=False)
|
||||
|
||||
if verbose:
|
||||
# Verbose mode: show source tags
|
||||
table.add_column("Source", style="dim", width=6, justify="center")
|
||||
|
||||
table.add_column("Path", style="cyan", no_wrap=True)
|
||||
table.add_column("Score", style="magenta", justify="right")
|
||||
table.add_column("Excerpt", style="white")
|
||||
|
||||
for res in results:
|
||||
excerpt = res.excerpt or ""
|
||||
table.add_row(res.path, f"{res.score:.3f}", excerpt)
|
||||
score_str = f"{res.score:.3f}"
|
||||
|
||||
if verbose:
|
||||
# Extract search source tag if available
|
||||
source = getattr(res, "search_source", None)
|
||||
source_tag = ""
|
||||
if source == "exact":
|
||||
source_tag = "[E]"
|
||||
elif source == "fuzzy":
|
||||
source_tag = "[F]"
|
||||
elif source == "vector":
|
||||
source_tag = "[V]"
|
||||
elif source == "fusion":
|
||||
source_tag = "[RRF]"
|
||||
table.add_row(source_tag, res.path, score_str, excerpt)
|
||||
else:
|
||||
table.add_row(res.path, score_str, excerpt)
|
||||
|
||||
console.print(table)
|
||||
|
||||
|
||||
202
codex-lens/src/codexlens/parsers/encoding.py
Normal file
@@ -0,0 +1,202 @@
|
||||
"""Optional encoding detection module for CodexLens.
|
||||
|
||||
Provides automatic encoding detection with graceful fallback to UTF-8.
|
||||
Install with: pip install codexlens[encoding]
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Tuple, Optional
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
# Feature flag for encoding detection availability
|
||||
ENCODING_DETECTION_AVAILABLE = False
|
||||
_import_error: Optional[str] = None
|
||||
|
||||
|
||||
def _detect_chardet_backend() -> Tuple[bool, Optional[str]]:
|
||||
"""Detect if chardet or charset-normalizer is available."""
|
||||
try:
|
||||
import chardet
|
||||
return True, None
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
try:
|
||||
from charset_normalizer import from_bytes
|
||||
return True, None
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
return False, "chardet not available. Install with: pip install codexlens[encoding]"
|
||||
|
||||
|
||||
# Initialize on module load
|
||||
ENCODING_DETECTION_AVAILABLE, _import_error = _detect_chardet_backend()
|
||||
|
||||
|
||||
def check_encoding_available() -> Tuple[bool, Optional[str]]:
|
||||
"""Check if encoding detection dependencies are available.
|
||||
|
||||
Returns:
|
||||
Tuple of (available, error_message)
|
||||
"""
|
||||
return ENCODING_DETECTION_AVAILABLE, _import_error
|
||||
|
||||
|
||||
def detect_encoding(content_bytes: bytes, confidence_threshold: float = 0.7) -> str:
|
||||
"""Detect encoding from file content bytes.
|
||||
|
||||
Uses chardet or charset-normalizer with configurable confidence threshold.
|
||||
Falls back to UTF-8 if confidence is too low or detection unavailable.
|
||||
|
||||
Args:
|
||||
content_bytes: Raw file content as bytes
|
||||
confidence_threshold: Minimum confidence (0.0-1.0) to accept detection
|
||||
|
||||
Returns:
|
||||
Detected encoding name (e.g., 'utf-8', 'iso-8859-1', 'gbk')
|
||||
Returns 'utf-8' as fallback if detection fails or confidence too low
|
||||
"""
|
||||
if not ENCODING_DETECTION_AVAILABLE:
|
||||
log.debug("Encoding detection not available, using UTF-8 fallback")
|
||||
return "utf-8"
|
||||
|
||||
if not content_bytes:
|
||||
return "utf-8"
|
||||
|
||||
try:
|
||||
# Try chardet first
|
||||
try:
|
||||
import chardet
|
||||
result = chardet.detect(content_bytes)
|
||||
encoding = result.get("encoding")
|
||||
confidence = result.get("confidence", 0.0)
|
||||
|
||||
if encoding and confidence >= confidence_threshold:
|
||||
log.debug(f"Detected encoding: {encoding} (confidence: {confidence:.2f})")
|
||||
# Normalize encoding name: replace underscores with hyphens
|
||||
return encoding.lower().replace('_', '-')
|
||||
else:
|
||||
log.debug(
|
||||
f"Low confidence encoding detection: {encoding} "
|
||||
f"(confidence: {confidence:.2f}), using UTF-8 fallback"
|
||||
)
|
||||
return "utf-8"
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
# Fallback to charset-normalizer
|
||||
try:
|
||||
from charset_normalizer import from_bytes
|
||||
results = from_bytes(content_bytes)
|
||||
if results:
|
||||
best = results.best()
|
||||
if best and best.encoding:
|
||||
log.debug(f"Detected encoding via charset-normalizer: {best.encoding}")
|
||||
# Normalize encoding name: replace underscores with hyphens
|
||||
return best.encoding.lower().replace('_', '-')
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
except Exception as e:
|
||||
log.warning(f"Encoding detection failed: {e}, using UTF-8 fallback")
|
||||
|
||||
return "utf-8"
|
||||
|
||||
|
||||
def read_file_safe(
|
||||
path: Path | str,
|
||||
confidence_threshold: float = 0.7,
|
||||
max_detection_bytes: int = 100_000
|
||||
) -> Tuple[str, str]:
|
||||
"""Read file with automatic encoding detection and safe decoding.
|
||||
|
||||
Reads file bytes, detects encoding, and decodes with error replacement
|
||||
to preserve file structure even with encoding issues.
|
||||
|
||||
Args:
|
||||
path: Path to file to read
|
||||
confidence_threshold: Minimum confidence for encoding detection
|
||||
max_detection_bytes: Maximum bytes to use for encoding detection (default 100KB)
|
||||
|
||||
Returns:
|
||||
Tuple of (content, detected_encoding)
|
||||
- content: Decoded file content (with U+FFFD replacement characters for unmappable bytes)
|
||||
- detected_encoding: Detected encoding name
|
||||
|
||||
Raises:
|
||||
OSError: If file cannot be read
|
||||
IsADirectoryError: If path is a directory
|
||||
"""
|
||||
file_path = Path(path) if isinstance(path, str) else path
|
||||
|
||||
# Read file bytes
|
||||
try:
|
||||
content_bytes = file_path.read_bytes()
|
||||
except Exception as e:
|
||||
log.error(f"Failed to read file {file_path}: {e}")
|
||||
raise
|
||||
|
||||
# Detect encoding from first N bytes for performance
|
||||
detection_sample = content_bytes[:max_detection_bytes] if len(content_bytes) > max_detection_bytes else content_bytes
|
||||
encoding = detect_encoding(detection_sample, confidence_threshold)
|
||||
|
||||
# Decode with error replacement to preserve structure
|
||||
try:
|
||||
content = content_bytes.decode(encoding, errors='replace')
|
||||
log.debug(f"Successfully decoded {file_path} using {encoding}")
|
||||
return content, encoding
|
||||
except Exception as e:
|
||||
# Final fallback to UTF-8 with replacement
|
||||
log.warning(f"Failed to decode {file_path} with {encoding}, using UTF-8: {e}")
|
||||
content = content_bytes.decode('utf-8', errors='replace')
|
||||
return content, 'utf-8'
|
||||
|
||||
|
||||
def is_binary_file(path: Path | str, sample_size: int = 8192) -> bool:
|
||||
"""Check if file is likely binary by sampling first bytes.
|
||||
|
||||
Uses heuristic: if >30% of sample bytes are null or non-text, consider binary.
|
||||
|
||||
Args:
|
||||
path: Path to file to check
|
||||
sample_size: Number of bytes to sample (default 8KB)
|
||||
|
||||
Returns:
|
||||
True if file appears to be binary, False otherwise
|
||||
"""
|
||||
file_path = Path(path) if isinstance(path, str) else path
|
||||
|
||||
try:
|
||||
with file_path.open('rb') as f:
|
||||
sample = f.read(sample_size)
|
||||
|
||||
if not sample:
|
||||
return False
|
||||
|
||||
# Count null bytes and non-printable characters
|
||||
null_count = sample.count(b'\x00')
|
||||
non_text_count = sum(1 for byte in sample if byte < 0x20 and byte not in (0x09, 0x0a, 0x0d))
|
||||
|
||||
# If >30% null bytes or >50% non-text, consider binary
|
||||
null_ratio = null_count / len(sample)
|
||||
non_text_ratio = non_text_count / len(sample)
|
||||
|
||||
return null_ratio > 0.3 or non_text_ratio > 0.5
|
||||
|
||||
except Exception as e:
|
||||
log.debug(f"Binary check failed for {file_path}: {e}, assuming text")
|
||||
return False
|
||||
|
||||
|
||||
__all__ = [
|
||||
"ENCODING_DETECTION_AVAILABLE",
|
||||
"check_encoding_available",
|
||||
"detect_encoding",
|
||||
"read_file_safe",
|
||||
"is_binary_file",
|
||||
]
|
||||
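A short usage sketch for the encoding helpers above (editor's illustration, not part of the commit). Only check_encoding_available, is_binary_file, and read_file_safe come from the module; the directory walk and the *.py filter are assumed for the example.

from pathlib import Path

from codexlens.parsers.encoding import (
    check_encoding_available,
    is_binary_file,
    read_file_safe,
)

available, err = check_encoding_available()
if not available:
    print(f"Encoding detection disabled, UTF-8 fallback only: {err}")

for path in Path("src").rglob("*.py"):   # walk and filter are example assumptions
    if is_binary_file(path):
        continue  # skip likely-binary files before decoding
    content, encoding = read_file_safe(path, confidence_threshold=0.7)
    print(f"{path}: decoded as {encoding} ({len(content)} chars)")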
@@ -18,6 +18,7 @@ from codexlens.storage.registry import RegistryStore, DirMapping
|
||||
from codexlens.storage.dir_index import DirIndexStore, SubdirLink
|
||||
from codexlens.storage.path_mapper import PathMapper
|
||||
from codexlens.storage.sqlite_store import SQLiteStore
|
||||
from codexlens.search.hybrid_search import HybridSearchEngine
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -32,6 +33,9 @@ class SearchOptions:
|
||||
include_symbols: Whether to include symbol search results
|
||||
files_only: Return only file paths without excerpts
|
||||
include_semantic: Whether to include semantic keyword search results
|
||||
hybrid_mode: Enable hybrid search with RRF fusion (default False)
|
||||
enable_fuzzy: Enable fuzzy FTS in hybrid mode (default True)
|
||||
hybrid_weights: Custom RRF weights for hybrid search (optional)
|
||||
"""
|
||||
depth: int = -1
|
||||
max_workers: int = 8
|
||||
@@ -40,6 +44,9 @@ class SearchOptions:
|
||||
include_symbols: bool = False
|
||||
files_only: bool = False
|
||||
include_semantic: bool = False
|
||||
hybrid_mode: bool = False
|
||||
enable_fuzzy: bool = True
|
||||
hybrid_weights: Optional[Dict[str, float]] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -484,7 +491,10 @@ class ChainSearchEngine:
|
||||
query,
|
||||
options.limit_per_dir,
|
||||
options.files_only,
|
||||
options.include_semantic
|
||||
options.include_semantic,
|
||||
options.hybrid_mode,
|
||||
options.enable_fuzzy,
|
||||
options.hybrid_weights
|
||||
): idx_path
|
||||
for idx_path in index_paths
|
||||
}
|
||||
@@ -507,7 +517,10 @@ class ChainSearchEngine:
|
||||
query: str,
|
||||
limit: int,
|
||||
files_only: bool = False,
|
||||
include_semantic: bool = False) -> List[SearchResult]:
|
||||
include_semantic: bool = False,
|
||||
hybrid_mode: bool = False,
|
||||
enable_fuzzy: bool = True,
|
||||
hybrid_weights: Optional[Dict[str, float]] = None) -> List[SearchResult]:
|
||||
"""Search a single index database.
|
||||
|
||||
Handles exceptions gracefully, returning empty list on failure.
|
||||
@@ -518,39 +531,54 @@ class ChainSearchEngine:
|
||||
limit: Maximum results from this index
|
||||
files_only: If True, skip snippet generation for faster search
|
||||
include_semantic: If True, also search semantic keywords and merge results
|
||||
hybrid_mode: If True, use hybrid search with RRF fusion
|
||||
enable_fuzzy: Enable fuzzy FTS in hybrid mode
|
||||
hybrid_weights: Custom RRF weights for hybrid search
|
||||
|
||||
Returns:
|
||||
List of SearchResult objects (empty on error)
|
||||
"""
|
||||
try:
|
||||
with DirIndexStore(index_path) as store:
|
||||
# Get FTS results
|
||||
if files_only:
|
||||
# Fast path: return paths only without snippets
|
||||
paths = store.search_files_only(query, limit=limit)
|
||||
fts_results = [SearchResult(path=p, score=0.0, excerpt="") for p in paths]
|
||||
else:
|
||||
fts_results = store.search_fts(query, limit=limit)
|
||||
|
||||
# Optionally add semantic keyword results
|
||||
if include_semantic:
|
||||
try:
|
||||
semantic_matches = store.search_semantic_keywords(query)
|
||||
# Convert semantic matches to SearchResult with 0.8x weight
|
||||
for file_entry, keywords in semantic_matches:
|
||||
# Create excerpt from keywords
|
||||
excerpt = f"Keywords: {', '.join(keywords[:5])}"
|
||||
# Use a base score of 10.0 for semantic matches, weighted by 0.8
|
||||
semantic_result = SearchResult(
|
||||
path=str(file_entry.full_path),
|
||||
score=10.0 * 0.8,
|
||||
excerpt=excerpt
|
||||
)
|
||||
fts_results.append(semantic_result)
|
||||
except Exception as sem_exc:
|
||||
self.logger.debug(f"Semantic search error in {index_path}: {sem_exc}")
|
||||
|
||||
return fts_results
|
||||
# Use hybrid search if enabled
|
||||
if hybrid_mode:
|
||||
hybrid_engine = HybridSearchEngine(weights=hybrid_weights)
|
||||
fts_results = hybrid_engine.search(
|
||||
index_path,
|
||||
query,
|
||||
limit=limit,
|
||||
enable_fuzzy=enable_fuzzy,
|
||||
enable_vector=False, # Vector search not yet implemented
|
||||
)
|
||||
else:
|
||||
# Legacy single-FTS search
|
||||
with DirIndexStore(index_path) as store:
|
||||
# Get FTS results
|
||||
if files_only:
|
||||
# Fast path: return paths only without snippets
|
||||
paths = store.search_files_only(query, limit=limit)
|
||||
fts_results = [SearchResult(path=p, score=0.0, excerpt="") for p in paths]
|
||||
else:
|
||||
fts_results = store.search_fts(query, limit=limit)
|
||||
|
||||
# Optionally add semantic keyword results
|
||||
if include_semantic:
|
||||
try:
|
||||
semantic_matches = store.search_semantic_keywords(query)
|
||||
# Convert semantic matches to SearchResult with 0.8x weight
|
||||
for file_entry, keywords in semantic_matches:
|
||||
# Create excerpt from keywords
|
||||
excerpt = f"Keywords: {', '.join(keywords[:5])}"
|
||||
# Use a base score of 10.0 for semantic matches, weighted by 0.8
|
||||
semantic_result = SearchResult(
|
||||
path=str(file_entry.full_path),
|
||||
score=10.0 * 0.8,
|
||||
excerpt=excerpt
|
||||
)
|
||||
fts_results.append(semantic_result)
|
||||
except Exception as sem_exc:
|
||||
self.logger.debug(f"Semantic search error in {index_path}: {sem_exc}")
|
||||
|
||||
return fts_results
|
||||
except Exception as exc:
|
||||
self.logger.debug(f"Search error in {index_path}: {exc}")
|
||||
return []
|
||||
|
||||
211
codex-lens/src/codexlens/search/hybrid_search.py
Normal file
@@ -0,0 +1,211 @@
|
||||
"""Hybrid search engine orchestrating parallel exact/fuzzy/vector searches with RRF fusion.
|
||||
|
||||
Coordinates multiple search backends in parallel using ThreadPoolExecutor and combines
|
||||
results via Reciprocal Rank Fusion (RRF) algorithm.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
from codexlens.entities import SearchResult
|
||||
from codexlens.search.ranking import reciprocal_rank_fusion, tag_search_source
|
||||
from codexlens.storage.dir_index import DirIndexStore
|
||||
|
||||
|
||||
class HybridSearchEngine:
|
||||
"""Hybrid search engine with parallel execution and RRF fusion.
|
||||
|
||||
Orchestrates searches across exact FTS, fuzzy FTS, and optional vector backends,
|
||||
executing them in parallel and fusing results via Reciprocal Rank Fusion.
|
||||
|
||||
Attributes:
|
||||
logger: Python logger instance
|
||||
default_weights: Default RRF weights for each source
|
||||
"""
|
||||
|
||||
# Default RRF weights (exact: 40%, fuzzy: 30%, vector: 30%)
|
||||
DEFAULT_WEIGHTS = {
|
||||
"exact": 0.4,
|
||||
"fuzzy": 0.3,
|
||||
"vector": 0.3,
|
||||
}
|
||||
|
||||
def __init__(self, weights: Optional[Dict[str, float]] = None):
|
||||
"""Initialize hybrid search engine.
|
||||
|
||||
Args:
|
||||
weights: Optional custom RRF weights (default: DEFAULT_WEIGHTS)
|
||||
"""
|
||||
self.logger = logging.getLogger(__name__)
|
||||
self.weights = weights or self.DEFAULT_WEIGHTS.copy()
|
||||
|
||||
def search(
|
||||
self,
|
||||
index_path: Path,
|
||||
query: str,
|
||||
limit: int = 20,
|
||||
enable_fuzzy: bool = True,
|
||||
enable_vector: bool = False,
|
||||
) -> List[SearchResult]:
|
||||
"""Execute hybrid search with parallel retrieval and RRF fusion.
|
||||
|
||||
Args:
|
||||
index_path: Path to _index.db file
|
||||
query: FTS5 query string
|
||||
limit: Maximum results to return after fusion
|
||||
enable_fuzzy: Enable fuzzy FTS search (default True)
|
||||
enable_vector: Enable vector search (default False)
|
||||
|
||||
Returns:
|
||||
List of SearchResult objects sorted by fusion score
|
||||
|
||||
Examples:
|
||||
>>> engine = HybridSearchEngine()
|
||||
>>> results = engine.search(Path("project/_index.db"), "authentication")
|
||||
>>> for r in results[:5]:
|
||||
... print(f"{r.path}: {r.score:.3f}")
|
||||
"""
|
||||
# Determine which backends to use
|
||||
backends = {"exact": True} # Always use exact search
|
||||
if enable_fuzzy:
|
||||
backends["fuzzy"] = True
|
||||
if enable_vector:
|
||||
backends["vector"] = True
|
||||
|
||||
# Execute parallel searches
|
||||
results_map = self._search_parallel(index_path, query, backends, limit)
|
||||
|
||||
# Apply RRF fusion
|
||||
# Filter weights to only active backends
|
||||
active_weights = {
|
||||
source: weight
|
||||
for source, weight in self.weights.items()
|
||||
if source in results_map
|
||||
}
|
||||
|
||||
fused_results = reciprocal_rank_fusion(results_map, active_weights)
|
||||
|
||||
# Apply final limit
|
||||
return fused_results[:limit]
|
||||
|
||||
def _search_parallel(
|
||||
self,
|
||||
index_path: Path,
|
||||
query: str,
|
||||
backends: Dict[str, bool],
|
||||
limit: int,
|
||||
) -> Dict[str, List[SearchResult]]:
|
||||
"""Execute parallel searches across enabled backends.
|
||||
|
||||
Args:
|
||||
index_path: Path to _index.db file
|
||||
query: FTS5 query string
|
||||
backends: Dictionary of backend name to enabled flag
|
||||
limit: Results limit per backend
|
||||
|
||||
Returns:
|
||||
Dictionary mapping source name to results list
|
||||
"""
|
||||
results_map: Dict[str, List[SearchResult]] = {}
|
||||
|
||||
# Use ThreadPoolExecutor for parallel I/O-bound searches
|
||||
with ThreadPoolExecutor(max_workers=len(backends)) as executor:
|
||||
# Submit search tasks
|
||||
future_to_source = {}
|
||||
|
||||
if backends.get("exact"):
|
||||
future = executor.submit(
|
||||
self._search_exact, index_path, query, limit
|
||||
)
|
||||
future_to_source[future] = "exact"
|
||||
|
||||
if backends.get("fuzzy"):
|
||||
future = executor.submit(
|
||||
self._search_fuzzy, index_path, query, limit
|
||||
)
|
||||
future_to_source[future] = "fuzzy"
|
||||
|
||||
if backends.get("vector"):
|
||||
future = executor.submit(
|
||||
self._search_vector, index_path, query, limit
|
||||
)
|
||||
future_to_source[future] = "vector"
|
||||
|
||||
# Collect results as they complete
|
||||
for future in as_completed(future_to_source):
|
||||
source = future_to_source[future]
|
||||
try:
|
||||
results = future.result()
|
||||
# Tag results with source for debugging
|
||||
tagged_results = tag_search_source(results, source)
|
||||
results_map[source] = tagged_results
|
||||
self.logger.debug(
|
||||
"Got %d results from %s search", len(results), source
|
||||
)
|
||||
except Exception as exc:
|
||||
self.logger.error("Search failed for %s: %s", source, exc)
|
||||
results_map[source] = []
|
||||
|
||||
return results_map
|
||||
|
||||
def _search_exact(
|
||||
self, index_path: Path, query: str, limit: int
|
||||
) -> List[SearchResult]:
|
||||
"""Execute exact FTS search using unicode61 tokenizer.
|
||||
|
||||
Args:
|
||||
index_path: Path to _index.db file
|
||||
query: FTS5 query string
|
||||
limit: Maximum results
|
||||
|
||||
Returns:
|
||||
List of SearchResult objects
|
||||
"""
|
||||
try:
|
||||
with DirIndexStore(index_path) as store:
|
||||
return store.search_fts_exact(query, limit=limit)
|
||||
except Exception as exc:
|
||||
self.logger.debug("Exact search error: %s", exc)
|
||||
return []
|
||||
|
||||
def _search_fuzzy(
|
||||
self, index_path: Path, query: str, limit: int
|
||||
) -> List[SearchResult]:
|
||||
"""Execute fuzzy FTS search using trigram/extended unicode61 tokenizer.
|
||||
|
||||
Args:
|
||||
index_path: Path to _index.db file
|
||||
query: FTS5 query string
|
||||
limit: Maximum results
|
||||
|
||||
Returns:
|
||||
List of SearchResult objects
|
||||
"""
|
||||
try:
|
||||
with DirIndexStore(index_path) as store:
|
||||
return store.search_fts_fuzzy(query, limit=limit)
|
||||
except Exception as exc:
|
||||
self.logger.debug("Fuzzy search error: %s", exc)
|
||||
return []
|
||||
|
||||
def _search_vector(
|
||||
self, index_path: Path, query: str, limit: int
|
||||
) -> List[SearchResult]:
|
||||
"""Execute vector search (placeholder for future implementation).
|
||||
|
||||
Args:
|
||||
index_path: Path to _index.db file
|
||||
query: Query string
|
||||
limit: Maximum results
|
||||
|
||||
Returns:
|
||||
List of SearchResult objects (empty for now)
|
||||
"""
|
||||
# Placeholder for vector search integration
|
||||
# Will be implemented when VectorStore is available
|
||||
self.logger.debug("Vector search not yet implemented")
|
||||
return []
|
||||
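Editor's usage sketch for the HybridSearchEngine defined above. The index path is a placeholder, and the metadata access assumes SearchResult.metadata is a plain dict as used in ranking.py; nothing here is prescribed by the commit itself.

from pathlib import Path

from codexlens.search.hybrid_search import HybridSearchEngine

# Custom weights match the constructor's Optional[Dict[str, float]] parameter.
engine = HybridSearchEngine(weights={"exact": 0.5, "fuzzy": 0.3, "vector": 0.2})

index_db = Path("~/.codexlens/indexes/myproject/_index.db").expanduser()  # placeholder path
results = engine.search(
    index_db,
    "getUserData",
    limit=10,
    enable_fuzzy=True,
    enable_vector=False,  # the vector backend is still a placeholder
)
for r in results:
    # "search_source" and "fusion_score" are written into metadata by ranking.py
    print(f"{r.score:.4f}", r.metadata.get("search_source"), r.path)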
242
codex-lens/src/codexlens/search/query_parser.py
Normal file
@@ -0,0 +1,242 @@
|
||||
"""Query preprocessing for CodexLens search.
|
||||
|
||||
Provides query expansion for better identifier matching:
|
||||
- CamelCase splitting: UserAuth → User OR Auth
|
||||
- snake_case splitting: user_auth → user OR auth
|
||||
- Preserves original query for exact matching
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import Set, List
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class QueryParser:
|
||||
"""Parser for preprocessing search queries before FTS5 execution.
|
||||
|
||||
Expands identifier-style queries (CamelCase, snake_case) into OR queries
|
||||
to improve recall when searching for code symbols.
|
||||
|
||||
Example transformations:
|
||||
- 'UserAuth' → 'UserAuth OR User OR Auth'
|
||||
- 'user_auth' → 'user_auth OR user OR auth'
|
||||
- 'getUserData' → 'getUserData OR get OR User OR Data'
|
||||
"""
|
||||
|
||||
# Patterns for identifier splitting
|
||||
CAMEL_CASE_PATTERN = re.compile(r'([a-z])([A-Z])')
|
||||
SNAKE_CASE_PATTERN = re.compile(r'_+')
|
||||
KEBAB_CASE_PATTERN = re.compile(r'-+')
|
||||
|
||||
# Minimum token length to include in expansion (avoid noise from single chars)
|
||||
MIN_TOKEN_LENGTH = 2
|
||||
|
||||
# All-caps acronyms pattern (e.g., HTTP, SQL, API)
|
||||
ALL_CAPS_PATTERN = re.compile(r'^[A-Z]{2,}$')
|
||||
|
||||
def __init__(self, enable: bool = True, min_token_length: int = 2):
|
||||
"""Initialize query parser.
|
||||
|
||||
Args:
|
||||
enable: Whether to enable query preprocessing
|
||||
min_token_length: Minimum token length to include in expansion
|
||||
"""
|
||||
self.enable = enable
|
||||
self.min_token_length = min_token_length
|
||||
|
||||
def preprocess_query(self, query: str) -> str:
|
||||
"""Preprocess query with identifier expansion.
|
||||
|
||||
Args:
|
||||
query: Original search query
|
||||
|
||||
Returns:
|
||||
Expanded query with OR operator connecting original and split tokens
|
||||
|
||||
Example:
|
||||
>>> parser = QueryParser()
|
||||
>>> parser.preprocess_query('UserAuth')
|
||||
'UserAuth OR User OR Auth'
|
||||
>>> parser.preprocess_query('get_user_data')
|
||||
'get_user_data OR get OR user OR data'
|
||||
"""
|
||||
if not self.enable:
|
||||
return query
|
||||
|
||||
query = query.strip()
|
||||
if not query:
|
||||
return query
|
||||
|
||||
# Extract tokens from query (handle multiple words/terms)
|
||||
# For simple queries, just process the whole thing
|
||||
# For complex FTS5 queries with operators, preserve structure
|
||||
if self._is_simple_query(query):
|
||||
return self._expand_simple_query(query)
|
||||
else:
|
||||
# Complex query with FTS5 operators, don't expand
|
||||
log.debug(f"Skipping expansion for complex FTS5 query: {query}")
|
||||
return query
|
||||
|
||||
def _is_simple_query(self, query: str) -> bool:
|
||||
"""Check if query is simple (no FTS5 operators).
|
||||
|
||||
Args:
|
||||
query: Search query
|
||||
|
||||
Returns:
|
||||
True if query is simple (safe to expand), False otherwise
|
||||
"""
|
||||
# Check for FTS5 operators that indicate complex query
|
||||
fts5_operators = ['OR', 'AND', 'NOT', 'NEAR', '*', '^', '"']
|
||||
return not any(op in query for op in fts5_operators)
|
||||
|
||||
def _expand_simple_query(self, query: str) -> str:
|
||||
"""Expand a simple query with identifier splitting.
|
||||
|
||||
Args:
|
||||
query: Simple search query
|
||||
|
||||
Returns:
|
||||
Expanded query with OR operators
|
||||
"""
|
||||
tokens: Set[str] = set()
|
||||
|
||||
# Always include original query
|
||||
tokens.add(query)
|
||||
|
||||
# Split on whitespace first
|
||||
words = query.split()
|
||||
|
||||
for word in words:
|
||||
# Extract tokens from this word
|
||||
word_tokens = self._extract_tokens(word)
|
||||
tokens.update(word_tokens)
|
||||
|
||||
# Filter out short tokens and duplicates
|
||||
filtered_tokens = [
|
||||
t for t in tokens
|
||||
if len(t) >= self.min_token_length
|
||||
]
|
||||
|
||||
# Remove duplicates while preserving original query first
|
||||
unique_tokens: List[str] = []
|
||||
seen: Set[str] = set()
|
||||
|
||||
# Always put original query first
|
||||
if query not in seen and len(query) >= self.min_token_length:
|
||||
unique_tokens.append(query)
|
||||
seen.add(query)
|
||||
|
||||
# Add other tokens
|
||||
for token in filtered_tokens:
|
||||
if token not in seen:
|
||||
unique_tokens.append(token)
|
||||
seen.add(token)
|
||||
|
||||
# Join with OR operator (only if we have multiple tokens)
|
||||
if len(unique_tokens) > 1:
|
||||
expanded = ' OR '.join(unique_tokens)
|
||||
log.debug(f"Expanded query: '{query}' → '{expanded}'")
|
||||
return expanded
|
||||
else:
|
||||
return query
|
||||
|
||||
def _extract_tokens(self, word: str) -> Set[str]:
|
||||
"""Extract tokens from a single word using various splitting strategies.
|
||||
|
||||
Args:
|
||||
word: Single word/identifier to split
|
||||
|
||||
Returns:
|
||||
Set of extracted tokens
|
||||
"""
|
||||
tokens: Set[str] = set()
|
||||
|
||||
# Add original word
|
||||
tokens.add(word)
|
||||
|
||||
# Handle all-caps acronyms (don't split)
|
||||
if self.ALL_CAPS_PATTERN.match(word):
|
||||
return tokens
|
||||
|
||||
# CamelCase splitting
|
||||
camel_tokens = self._split_camel_case(word)
|
||||
tokens.update(camel_tokens)
|
||||
|
||||
# snake_case splitting
|
||||
snake_tokens = self._split_snake_case(word)
|
||||
tokens.update(snake_tokens)
|
||||
|
||||
# kebab-case splitting
|
||||
kebab_tokens = self._split_kebab_case(word)
|
||||
tokens.update(kebab_tokens)
|
||||
|
||||
return tokens
|
||||
|
||||
def _split_camel_case(self, word: str) -> List[str]:
|
||||
"""Split CamelCase identifier into tokens.
|
||||
|
||||
Args:
|
||||
word: CamelCase identifier (e.g., 'getUserData')
|
||||
|
||||
Returns:
|
||||
List of tokens (e.g., ['get', 'User', 'Data'])
|
||||
"""
|
||||
# Insert space before uppercase letters preceded by lowercase
|
||||
spaced = self.CAMEL_CASE_PATTERN.sub(r'\1 \2', word)
|
||||
# Split on spaces and filter empty
|
||||
return [t for t in spaced.split() if t]
|
||||
|
||||
def _split_snake_case(self, word: str) -> List[str]:
|
||||
"""Split snake_case identifier into tokens.
|
||||
|
||||
Args:
|
||||
word: snake_case identifier (e.g., 'get_user_data')
|
||||
|
||||
Returns:
|
||||
List of tokens (e.g., ['get', 'user', 'data'])
|
||||
"""
|
||||
# Split on underscores
|
||||
return [t for t in self.SNAKE_CASE_PATTERN.split(word) if t]
|
||||
|
||||
def _split_kebab_case(self, word: str) -> List[str]:
|
||||
"""Split kebab-case identifier into tokens.
|
||||
|
||||
Args:
|
||||
word: kebab-case identifier (e.g., 'get-user-data')
|
||||
|
||||
Returns:
|
||||
List of tokens (e.g., ['get', 'user', 'data'])
|
||||
"""
|
||||
# Split on hyphens
|
||||
return [t for t in self.KEBAB_CASE_PATTERN.split(word) if t]
|
||||
|
||||
|
||||
# Global default parser instance
|
||||
_default_parser = QueryParser(enable=True)
|
||||
|
||||
|
||||
def preprocess_query(query: str, enable: bool = True) -> str:
|
||||
"""Convenience function for query preprocessing.
|
||||
|
||||
Args:
|
||||
query: Original search query
|
||||
enable: Whether to enable preprocessing
|
||||
|
||||
Returns:
|
||||
Preprocessed query with identifier expansion
|
||||
"""
|
||||
if not enable:
|
||||
return query
|
||||
|
||||
return _default_parser.preprocess_query(query)
|
||||
|
||||
|
||||
__all__ = [
|
||||
"QueryParser",
|
||||
"preprocess_query",
|
||||
]
|
||||
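Editor's illustration of the QueryParser behaviour described above. Because the expansion tokens come from a set, the order of the OR terms after the original query is not guaranteed; the comments show representative outputs only.

from codexlens.search.query_parser import QueryParser, preprocess_query

parser = QueryParser(enable=True, min_token_length=2)

print(parser.preprocess_query("UserAuth"))        # e.g. 'UserAuth OR User OR Auth'
print(parser.preprocess_query("get_user_data"))   # original plus 'get', 'user', 'data'
print(parser.preprocess_query("HTTP"))            # all-caps acronym, left unsplit
print(parser.preprocess_query('"user auth"'))     # contains an FTS5 operator, not expanded

# Module-level convenience wrapper with the same behaviour
print(preprocess_query("fetch-user-profile"))     # kebab-case split into 'fetch', 'user', 'profile'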
160
codex-lens/src/codexlens/search/ranking.py
Normal file
@@ -0,0 +1,160 @@
|
||||
"""Ranking algorithms for hybrid search result fusion.
|
||||
|
||||
Implements Reciprocal Rank Fusion (RRF) and score normalization utilities
|
||||
for combining results from heterogeneous search backends (exact FTS, fuzzy FTS, vector search).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
from typing import Dict, List
|
||||
|
||||
from codexlens.entities import SearchResult
|
||||
|
||||
|
||||
def reciprocal_rank_fusion(
|
||||
results_map: Dict[str, List[SearchResult]],
|
||||
weights: Dict[str, float] = None,
|
||||
k: int = 60,
|
||||
) -> List[SearchResult]:
|
||||
"""Combine search results from multiple sources using Reciprocal Rank Fusion.
|
||||
|
||||
RRF formula: score(d) = Σ weight_source / (k + rank_source(d))
|
||||
|
||||
Args:
|
||||
results_map: Dictionary mapping source name to list of SearchResult objects
|
||||
Sources: 'exact', 'fuzzy', 'vector'
|
||||
weights: Dictionary mapping source name to weight (default: equal weights)
|
||||
Example: {'exact': 0.4, 'fuzzy': 0.3, 'vector': 0.3}
|
||||
k: Constant to avoid division by zero and control rank influence (default 60)
|
||||
|
||||
Returns:
|
||||
List of SearchResult objects sorted by fused score (descending)
|
||||
|
||||
Examples:
|
||||
>>> exact_results = [SearchResult(path="a.py", score=10.0, excerpt="...")]
|
||||
>>> fuzzy_results = [SearchResult(path="b.py", score=8.0, excerpt="...")]
|
||||
>>> results_map = {'exact': exact_results, 'fuzzy': fuzzy_results}
|
||||
>>> fused = reciprocal_rank_fusion(results_map)
|
||||
"""
|
||||
if not results_map:
|
||||
return []
|
||||
|
||||
# Default equal weights if not provided
|
||||
if weights is None:
|
||||
num_sources = len(results_map)
|
||||
weights = {source: 1.0 / num_sources for source in results_map}
|
||||
|
||||
# Validate weights sum to 1.0
|
||||
weight_sum = sum(weights.values())
|
||||
if not math.isclose(weight_sum, 1.0, abs_tol=0.01):
|
||||
# Normalize weights to sum to 1.0
|
||||
weights = {source: w / weight_sum for source, w in weights.items()}
|
||||
|
||||
# Build unified result set with RRF scores
|
||||
path_to_result: Dict[str, SearchResult] = {}
|
||||
path_to_fusion_score: Dict[str, float] = {}
|
||||
|
||||
for source_name, results in results_map.items():
|
||||
weight = weights.get(source_name, 0.0)
|
||||
if weight == 0:
|
||||
continue
|
||||
|
||||
for rank, result in enumerate(results, start=1):
|
||||
path = result.path
|
||||
rrf_contribution = weight / (k + rank)
|
||||
|
||||
# Initialize or accumulate fusion score
|
||||
if path not in path_to_fusion_score:
|
||||
path_to_fusion_score[path] = 0.0
|
||||
path_to_result[path] = result
|
||||
|
||||
path_to_fusion_score[path] += rrf_contribution
|
||||
|
||||
# Create final results with fusion scores
|
||||
fused_results = []
|
||||
for path, base_result in path_to_result.items():
|
||||
fusion_score = path_to_fusion_score[path]
|
||||
|
||||
# Create new SearchResult with fusion_score in metadata
|
||||
fused_result = SearchResult(
|
||||
path=base_result.path,
|
||||
score=fusion_score,
|
||||
excerpt=base_result.excerpt,
|
||||
content=base_result.content,
|
||||
symbol=base_result.symbol,
|
||||
chunk=base_result.chunk,
|
||||
metadata={
|
||||
**base_result.metadata,
|
||||
"fusion_score": fusion_score,
|
||||
"original_score": base_result.score,
|
||||
},
|
||||
start_line=base_result.start_line,
|
||||
end_line=base_result.end_line,
|
||||
symbol_name=base_result.symbol_name,
|
||||
symbol_kind=base_result.symbol_kind,
|
||||
)
|
||||
fused_results.append(fused_result)
|
||||
|
||||
# Sort by fusion score descending
|
||||
fused_results.sort(key=lambda r: r.score, reverse=True)
|
||||
|
||||
return fused_results
|
||||
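A worked numeric instance of the RRF formula above (editor's illustration; SearchResult is constructed exactly as in the function's docstring example).

# score(d) = sum over sources of weight_source / (k + rank_source(d)).
# Weights {'exact': 0.4, 'fuzzy': 0.3} sum to 0.7, so the function renormalises
# them to roughly {'exact': 0.571, 'fuzzy': 0.429}. With k = 60:
#   a.py: rank 1 in exact, rank 2 in fuzzy
#         0.571/(60+1) + 0.429/(60+2) ≈ 0.00937 + 0.00692 ≈ 0.0163
#   b.py: rank 1 in fuzzy only
#         0.429/(60+1) ≈ 0.00703
exact_results = [SearchResult(path="a.py", score=12.0, excerpt="...")]
fuzzy_results = [
    SearchResult(path="b.py", score=9.0, excerpt="..."),
    SearchResult(path="a.py", score=7.5, excerpt="..."),
]
fused = reciprocal_rank_fusion(
    {"exact": exact_results, "fuzzy": fuzzy_results},
    weights={"exact": 0.4, "fuzzy": 0.3},
)
# fused[0].path == "a.py" with score ≈ 0.0163; b.py follows at ≈ 0.0070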
|
||||
|
||||
def normalize_bm25_score(score: float) -> float:
|
||||
"""Normalize BM25 scores from SQLite FTS5 to 0-1 range.
|
||||
|
||||
SQLite FTS5 returns negative BM25 scores (more negative = better match).
|
||||
Uses sigmoid transformation for normalization.
|
||||
|
||||
Args:
|
||||
score: Raw BM25 score from SQLite (typically negative)
|
||||
|
||||
Returns:
|
||||
Normalized score in range [0, 1]
|
||||
|
||||
Examples:
|
||||
>>> normalize_bm25_score(-10.5) # Good match
|
||||
0.74
|
||||
>>> normalize_bm25_score(-1.2) # Weak match
|
||||
0.53
|
||||
"""
|
||||
# Take absolute value (BM25 is negative in SQLite)
|
||||
abs_score = abs(score)
|
||||
|
||||
# Sigmoid transformation: 1 / (1 + e^(-x))
|
||||
# Scale factor of 0.1 maps typical BM25 range (-20 to 0) to (0, 1)
|
||||
normalized = 1.0 / (1.0 + math.exp(-abs_score * 0.1))
|
||||
|
||||
return normalized
|
||||
|
||||
|
||||
def tag_search_source(results: List[SearchResult], source: str) -> List[SearchResult]:
|
||||
"""Tag search results with their source for RRF tracking.
|
||||
|
||||
Args:
|
||||
results: List of SearchResult objects
|
||||
source: Source identifier ('exact', 'fuzzy', 'vector')
|
||||
|
||||
Returns:
|
||||
List of SearchResult objects with 'search_source' in metadata
|
||||
"""
|
||||
tagged_results = []
|
||||
for result in results:
|
||||
tagged_result = SearchResult(
|
||||
path=result.path,
|
||||
score=result.score,
|
||||
excerpt=result.excerpt,
|
||||
content=result.content,
|
||||
symbol=result.symbol,
|
||||
chunk=result.chunk,
|
||||
metadata={**result.metadata, "search_source": source},
|
||||
start_line=result.start_line,
|
||||
end_line=result.end_line,
|
||||
symbol_name=result.symbol_name,
|
||||
symbol_kind=result.symbol_kind,
|
||||
)
|
||||
tagged_results.append(tagged_result)
|
||||
|
||||
return tagged_results
|
||||
@@ -57,7 +57,7 @@ class DirIndexStore:
|
||||
|
||||
# Schema version for migration tracking
|
||||
# Increment this when schema changes require migration
|
||||
SCHEMA_VERSION = 2
|
||||
SCHEMA_VERSION = 4
|
||||
|
||||
def __init__(self, db_path: str | Path) -> None:
|
||||
"""Initialize directory index store.
|
||||
@@ -93,11 +93,13 @@ class DirIndexStore:
|
||||
)
|
||||
|
||||
# Create or migrate schema
|
||||
self._create_schema(conn)
|
||||
self._create_fts_triggers(conn)
|
||||
|
||||
# Apply versioned migrations if needed
|
||||
if current_version < self.SCHEMA_VERSION:
|
||||
if current_version == 0:
|
||||
# New database - create schema directly
|
||||
self._create_schema(conn)
|
||||
self._create_fts_triggers(conn)
|
||||
self._set_schema_version(conn, self.SCHEMA_VERSION)
|
||||
elif current_version < self.SCHEMA_VERSION:
|
||||
# Existing database - apply migrations
|
||||
self._apply_migrations(conn, current_version)
|
||||
self._set_schema_version(conn, self.SCHEMA_VERSION)
|
||||
|
||||
@@ -126,6 +128,11 @@ class DirIndexStore:
|
||||
if from_version < 2:
|
||||
self._migrate_v2_add_name_column(conn)
|
||||
|
||||
# Migration v2 -> v4: Add dual FTS tables (exact + fuzzy)
|
||||
if from_version < 4:
|
||||
from codexlens.storage.migrations.migration_004_dual_fts import upgrade
|
||||
upgrade(conn)
|
||||
|
||||
def close(self) -> None:
|
||||
"""Close database connection."""
|
||||
with self._lock:
|
||||
@@ -465,6 +472,117 @@ class DirIndexStore:
|
||||
|
||||
return float(row["mtime"]) if row and row["mtime"] else None
|
||||
|
||||
def needs_reindex(self, full_path: str | Path) -> bool:
|
||||
"""Check if a file needs reindexing based on mtime comparison.
|
||||
|
||||
Uses 1ms tolerance to handle filesystem timestamp precision variations.
|
||||
|
||||
Args:
|
||||
full_path: Complete source file path
|
||||
|
||||
Returns:
|
||||
True if file should be reindexed (new, modified, or missing from index)
|
||||
"""
|
||||
full_path_obj = Path(full_path).resolve()
|
||||
if not full_path_obj.exists():
|
||||
return False # File doesn't exist, skip indexing
|
||||
|
||||
# Get current filesystem mtime
|
||||
try:
|
||||
current_mtime = full_path_obj.stat().st_mtime
|
||||
except OSError:
|
||||
return False # Can't read file stats, skip
|
||||
|
||||
# Get stored mtime from database
|
||||
stored_mtime = self.get_file_mtime(full_path_obj)
|
||||
|
||||
# File not in index, needs indexing
|
||||
if stored_mtime is None:
|
||||
return True
|
||||
|
||||
# Compare with 1ms tolerance for floating point precision
|
||||
MTIME_TOLERANCE = 0.001
|
||||
return abs(current_mtime - stored_mtime) > MTIME_TOLERANCE
|
||||
|
||||
def add_file_incremental(
|
||||
self,
|
||||
name: str,
|
||||
full_path: str | Path,
|
||||
content: str,
|
||||
language: str,
|
||||
symbols: Optional[List[Symbol]] = None,
|
||||
) -> Optional[int]:
|
||||
"""Add or update a file only if it has changed (incremental indexing).
|
||||
|
||||
Checks mtime before indexing to skip unchanged files.
|
||||
|
||||
Args:
|
||||
name: Filename without path
|
||||
full_path: Complete source file path
|
||||
content: File content for indexing
|
||||
language: Programming language identifier
|
||||
symbols: List of Symbol objects from the file
|
||||
|
||||
Returns:
|
||||
Database file_id if indexed, None if skipped (unchanged)
|
||||
|
||||
Raises:
|
||||
StorageError: If database operations fail
|
||||
"""
|
||||
# Check if reindexing is needed
|
||||
if not self.needs_reindex(full_path):
|
||||
return None # Skip unchanged file
|
||||
|
||||
# File changed or new, perform full indexing
|
||||
return self.add_file(name, full_path, content, language, symbols)
|
||||
|
||||
def cleanup_deleted_files(self, source_dir: Path) -> int:
|
||||
"""Remove indexed files that no longer exist in the source directory.
|
||||
|
||||
Scans the source directory and removes database entries for deleted files.
|
||||
|
||||
Args:
|
||||
source_dir: Source directory to scan
|
||||
|
||||
Returns:
|
||||
Number of deleted file entries removed
|
||||
|
||||
Raises:
|
||||
StorageError: If cleanup operations fail
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
source_dir = source_dir.resolve()
|
||||
|
||||
try:
|
||||
# Get all indexed file paths
|
||||
rows = conn.execute("SELECT full_path FROM files").fetchall()
|
||||
indexed_paths = {row["full_path"] for row in rows}
|
||||
|
||||
# Build set of existing files in source directory
|
||||
existing_paths = set()
|
||||
for file_path in source_dir.rglob("*"):
|
||||
if file_path.is_file():
|
||||
existing_paths.add(str(file_path.resolve()))
|
||||
|
||||
# Find orphaned entries (indexed but no longer exist)
|
||||
deleted_paths = indexed_paths - existing_paths
|
||||
|
||||
# Remove orphaned entries
|
||||
deleted_count = 0
|
||||
for deleted_path in deleted_paths:
|
||||
conn.execute("DELETE FROM files WHERE full_path=?", (deleted_path,))
|
||||
deleted_count += 1
|
||||
|
||||
if deleted_count > 0:
|
||||
conn.commit()
|
||||
|
||||
return deleted_count
|
||||
|
||||
except Exception as exc:
|
||||
conn.rollback()
|
||||
raise StorageError(f"Failed to cleanup deleted files: {exc}") from exc
|
||||
|
||||
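Editor's sketch of how a caller might drive the incremental API above. The directory walk, the hard-coded language, and the empty symbol list are placeholders; only add_file_incremental and cleanup_deleted_files are taken from DirIndexStore, and real indexing lives in IndexTreeBuilder.

from pathlib import Path

def reindex_directory(store: DirIndexStore, source_dir: Path) -> dict:
    """Hypothetical driver for incremental reindexing of one directory."""
    indexed = skipped = 0
    for path in sorted(source_dir.iterdir()):
        if not path.is_file():
            continue
        content = path.read_text(encoding="utf-8", errors="replace")
        file_id = store.add_file_incremental(
            name=path.name,
            full_path=path,
            content=content,
            language="python",   # placeholder: real code detects the language
            symbols=None,
        )
        if file_id is None:
            skipped += 1         # mtime unchanged, file was skipped
        else:
            indexed += 1
    removed = store.cleanup_deleted_files(source_dir)
    return {"indexed": indexed, "skipped": skipped, "removed": removed}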
def list_files(self) -> List[FileEntry]:
|
||||
"""List all files in current directory.
|
||||
|
||||
@@ -985,6 +1103,92 @@ class DirIndexStore:
|
||||
)
|
||||
return results
|
||||
|
||||
    def search_fts_exact(self, query: str, limit: int = 20) -> List[SearchResult]:
        """Full-text search using exact token matching (unicode61 tokenizer).

        Args:
            query: FTS5 query string
            limit: Maximum results to return

        Returns:
            List of SearchResult objects sorted by relevance

        Raises:
            StorageError: If FTS search fails
        """
        with self._lock:
            conn = self._get_connection()
            try:
                rows = conn.execute(
                    """
                    SELECT rowid, full_path, bm25(files_fts_exact) AS rank,
                           snippet(files_fts_exact, 2, '[bold red]', '[/bold red]', '...', 20) AS excerpt
                    FROM files_fts_exact
                    WHERE files_fts_exact MATCH ?
                    ORDER BY rank
                    LIMIT ?
                    """,
                    (query, limit),
                ).fetchall()
            except sqlite3.DatabaseError as exc:
                raise StorageError(f"FTS exact search failed: {exc}") from exc

            results: List[SearchResult] = []
            for row in rows:
                rank = float(row["rank"]) if row["rank"] is not None else 0.0
                score = abs(rank) if rank < 0 else 0.0
                results.append(
                    SearchResult(
                        path=row["full_path"],
                        score=score,
                        excerpt=row["excerpt"],
                    )
                )
            return results

    def search_fts_fuzzy(self, query: str, limit: int = 20) -> List[SearchResult]:
        """Full-text search using fuzzy/substring matching (trigram or extended unicode61 tokenizer).

        Args:
            query: FTS5 query string
            limit: Maximum results to return

        Returns:
            List of SearchResult objects sorted by relevance

        Raises:
            StorageError: If FTS search fails
        """
        with self._lock:
            conn = self._get_connection()
            try:
                rows = conn.execute(
                    """
                    SELECT rowid, full_path, bm25(files_fts_fuzzy) AS rank,
                           snippet(files_fts_fuzzy, 2, '[bold red]', '[/bold red]', '...', 20) AS excerpt
                    FROM files_fts_fuzzy
                    WHERE files_fts_fuzzy MATCH ?
                    ORDER BY rank
                    LIMIT ?
                    """,
                    (query, limit),
                ).fetchall()
            except sqlite3.DatabaseError as exc:
                raise StorageError(f"FTS fuzzy search failed: {exc}") from exc

            results: List[SearchResult] = []
            for row in rows:
                rank = float(row["rank"]) if row["rank"] is not None else 0.0
                score = abs(rank) if rank < 0 else 0.0
                results.append(
                    SearchResult(
                        path=row["full_path"],
                        score=score,
                        excerpt=row["excerpt"],
                    )
                )
            return results
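Both methods use the same rank-to-score conversion: SQLite's bm25() returns more-negative values for better matches, so taking the absolute value of a negative rank yields a positive score where higher means more relevant. A tiny standalone illustration (the sample ranks are made up):

def bm25_to_score(rank):
    """Convert an SQLite bm25() rank into a non-negative relevance score."""
    value = float(rank) if rank is not None else 0.0
    return abs(value) if value < 0 else 0.0

for raw in (-7.3, -1.2, 0.0, None):
    print(raw, "->", bm25_to_score(raw))
# -7.3 -> 7.3   (best match)
# -1.2 -> 1.2
#  0.0 -> 0.0
# None -> 0.0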

    def search_files_only(self, query: str, limit: int = 20) -> List[str]:
        """Fast FTS search returning only file paths (no snippet generation).

@@ -1185,16 +1389,34 @@ class DirIndexStore:
            """
        )

        # FTS5 external content table with code-friendly tokenizer
        # unicode61 tokenchars keeps underscores as part of tokens
        # so 'user_id' is indexed as one token, not 'user' and 'id'
        # Dual FTS5 external content tables for exact and fuzzy matching
        # files_fts_exact: unicode61 tokenizer for exact token matching
        # files_fts_fuzzy: trigram tokenizer (or extended unicode61) for substring/fuzzy matching
        from codexlens.storage.sqlite_utils import check_trigram_support

        has_trigram = check_trigram_support(conn)
        fuzzy_tokenizer = "trigram" if has_trigram else "unicode61 tokenchars '_-'"

        # Exact FTS table with unicode61 tokenizer
        conn.execute(
            """
            CREATE VIRTUAL TABLE IF NOT EXISTS files_fts USING fts5(
            CREATE VIRTUAL TABLE IF NOT EXISTS files_fts_exact USING fts5(
                name, full_path UNINDEXED, content,
                content='files',
                content_rowid='id',
                tokenize="unicode61 tokenchars '_'"
                tokenize="unicode61 tokenchars '_-'"
            )
            """
        )

        # Fuzzy FTS table with trigram or extended unicode61 tokenizer
        conn.execute(
            f"""
            CREATE VIRTUAL TABLE IF NOT EXISTS files_fts_fuzzy USING fts5(
                name, full_path UNINDEXED, content,
                content='files',
                content_rowid='id',
                tokenize="{fuzzy_tokenizer}"
            )
            """
        )
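The split matters because the two tokenizers answer different questions: unicode61 with tokenchars '_-' keeps an identifier such as get_user_id as a single token (good for exact lookups), while trigram decomposes text into overlapping 3-grams and therefore supports substring matching. A standalone sketch against an in-memory database, separate from the schema above (the trigram half requires SQLite >= 3.34 and may fail on older builds):

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute(
    """
    CREATE VIRTUAL TABLE exact_demo USING fts5(
        content, tokenize="unicode61 tokenchars '_-'"
    )
    """
)
conn.execute("CREATE VIRTUAL TABLE fuzzy_demo USING fts5(content, tokenize='trigram')")
for table in ("exact_demo", "fuzzy_demo"):
    conn.execute(f"INSERT INTO {table}(content) VALUES ('def get_user_id(session): ...')")

def count(table, query):
    return conn.execute(
        f"SELECT count(*) FROM {table} WHERE {table} MATCH ?", (query,)
    ).fetchone()[0]

print(count("exact_demo", '"get_user_id"'))  # 1 -- the whole identifier is one token
print(count("exact_demo", '"user"'))         # 0 -- no bare 'user' token was indexed
print(count("fuzzy_demo", '"user_id"'))      # 1 -- trigram matching finds the substring
conn.close()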
@@ -1301,38 +1523,72 @@ class DirIndexStore:
        conn.execute("UPDATE files SET name = ? WHERE id = ?", (name, file_id))

    def _create_fts_triggers(self, conn: sqlite3.Connection) -> None:
        """Create FTS5 external content triggers.
        """Create FTS5 external content triggers for dual FTS tables.

        Creates synchronized triggers for both files_fts_exact and files_fts_fuzzy tables.

        Args:
            conn: Database connection
        """
        # Insert trigger
        # Insert triggers for files_fts_exact
        conn.execute(
            """
            CREATE TRIGGER IF NOT EXISTS files_ai AFTER INSERT ON files BEGIN
                INSERT INTO files_fts(rowid, name, full_path, content)
            CREATE TRIGGER IF NOT EXISTS files_exact_ai AFTER INSERT ON files BEGIN
                INSERT INTO files_fts_exact(rowid, name, full_path, content)
                VALUES(new.id, new.name, new.full_path, new.content);
            END
            """
        )

        # Delete trigger
        # Delete trigger for files_fts_exact
        conn.execute(
            """
            CREATE TRIGGER IF NOT EXISTS files_ad AFTER DELETE ON files BEGIN
                INSERT INTO files_fts(files_fts, rowid, name, full_path, content)
            CREATE TRIGGER IF NOT EXISTS files_exact_ad AFTER DELETE ON files BEGIN
                INSERT INTO files_fts_exact(files_fts_exact, rowid, name, full_path, content)
                VALUES('delete', old.id, old.name, old.full_path, old.content);
            END
            """
        )

        # Update trigger
        # Update trigger for files_fts_exact
        conn.execute(
            """
            CREATE TRIGGER IF NOT EXISTS files_au AFTER UPDATE ON files BEGIN
                INSERT INTO files_fts(files_fts, rowid, name, full_path, content)
            CREATE TRIGGER IF NOT EXISTS files_exact_au AFTER UPDATE ON files BEGIN
                INSERT INTO files_fts_exact(files_fts_exact, rowid, name, full_path, content)
                VALUES('delete', old.id, old.name, old.full_path, old.content);
                INSERT INTO files_fts(rowid, name, full_path, content)
                INSERT INTO files_fts_exact(rowid, name, full_path, content)
                VALUES(new.id, new.name, new.full_path, new.content);
            END
            """
        )

        # Insert trigger for files_fts_fuzzy
        conn.execute(
            """
            CREATE TRIGGER IF NOT EXISTS files_fuzzy_ai AFTER INSERT ON files BEGIN
                INSERT INTO files_fts_fuzzy(rowid, name, full_path, content)
                VALUES(new.id, new.name, new.full_path, new.content);
            END
            """
        )

        # Delete trigger for files_fts_fuzzy
        conn.execute(
            """
            CREATE TRIGGER IF NOT EXISTS files_fuzzy_ad AFTER DELETE ON files BEGIN
                INSERT INTO files_fts_fuzzy(files_fts_fuzzy, rowid, name, full_path, content)
                VALUES('delete', old.id, old.name, old.full_path, old.content);
            END
            """
        )

        # Update trigger for files_fts_fuzzy
        conn.execute(
            """
            CREATE TRIGGER IF NOT EXISTS files_fuzzy_au AFTER UPDATE ON files BEGIN
                INSERT INTO files_fts_fuzzy(files_fts_fuzzy, rowid, name, full_path, content)
                VALUES('delete', old.id, old.name, old.full_path, old.content);
                INSERT INTO files_fts_fuzzy(rowid, name, full_path, content)
                VALUES(new.id, new.name, new.full_path, new.content);
            END
            """
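Because both FTS tables are external-content tables kept in sync by these triggers, a single write to files is enough to make a row searchable in both indexes. A rough standalone check; the exact column set of the files table is not shown in this hunk, so the INSERT below is an assumption and additional required columns may need values:

import sqlite3

# Illustrative path; assumes this database already has the files table,
# both FTS tables, and the triggers created above.
conn = sqlite3.connect("/path/to/index.db")
conn.execute(
    "INSERT INTO files(name, full_path, content) VALUES (?, ?, ?)",
    ("dir_index.py", "/repo/src/codexlens/storage/dir_index.py",
     "class DirIndexStore:\n    def search_fts_exact(self, query): ..."),
)
conn.commit()

# The insert triggers mirror the new row into both FTS tables.
for table in ("files_fts_exact", "files_fts_fuzzy"):
    hits = conn.execute(
        f"SELECT count(*) FROM {table} WHERE {table} MATCH ?", ('"DirIndexStore"',)
    ).fetchone()[0]
    print(table, hits)  # expect 1 for both once the triggers are in place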
@@ -77,7 +77,7 @@ class IndexTreeBuilder:
    }

    def __init__(
        self, registry: RegistryStore, mapper: PathMapper, config: Config = None
        self, registry: RegistryStore, mapper: PathMapper, config: Config = None, incremental: bool = True
    ):
        """Initialize the index tree builder.

@@ -85,18 +85,21 @@ class IndexTreeBuilder:
            registry: Global registry store for project tracking
            mapper: Path mapper for source to index conversions
            config: CodexLens configuration (uses defaults if None)
            incremental: Enable incremental indexing (default True)
        """
        self.registry = registry
        self.mapper = mapper
        self.config = config or Config()
        self.parser_factory = ParserFactory(self.config)
        self.logger = logging.getLogger(__name__)
        self.incremental = incremental

    def build(
        self,
        source_root: Path,
        languages: List[str] = None,
        workers: int = 4,
        force_full: bool = False,
    ) -> BuildResult:
        """Build complete index tree for a project.

@@ -106,11 +109,13 @@ class IndexTreeBuilder:
        3. Build indexes bottom-up (deepest first)
        4. Link subdirectories to parents
        5. Update project statistics
        6. Cleanup deleted files (if incremental mode)

        Args:
            source_root: Project root directory to index
            languages: Optional list of language IDs to limit indexing
            workers: Number of parallel worker processes
            force_full: Force full reindex (override incremental mode)

        Returns:
            BuildResult with statistics and errors

@@ -122,7 +127,12 @@ class IndexTreeBuilder:
        if not source_root.exists():
            raise ValueError(f"Source root does not exist: {source_root}")

        self.logger.info("Building index tree for %s", source_root)
        # Override incremental mode if force_full is True
        use_incremental = self.incremental and not force_full
        if force_full:
            self.logger.info("Building index tree for %s (FULL reindex)", source_root)
        else:
            self.logger.info("Building index tree for %s (incremental=%s)", source_root, use_incremental)

        # Register project
        index_root = self.mapper.source_to_index_dir(source_root)

@@ -186,6 +196,25 @@ class IndexTreeBuilder:
            # Link children to this directory
            self._link_children_to_parent(result.source_path, all_results)

        # Cleanup deleted files if in incremental mode
        if use_incremental:
            self.logger.info("Cleaning up deleted files...")
            total_deleted = 0
            for result in all_results:
                if result.error:
                    continue
                try:
                    with DirIndexStore(result.index_path) as store:
                        deleted_count = store.cleanup_deleted_files(result.source_path)
                        total_deleted += deleted_count
                        if deleted_count > 0:
                            self.logger.debug("Removed %d deleted files from %s", deleted_count, result.source_path)
                except Exception as exc:
                    self.logger.warning("Cleanup failed for %s: %s", result.source_path, exc)

            if total_deleted > 0:
                self.logger.info("Removed %d deleted files from index", total_deleted)

        # Update project statistics
        self.registry.update_project_stats(source_root, total_files, total_dirs)

@@ -436,9 +465,15 @@ class IndexTreeBuilder:

        files_count = 0
        symbols_count = 0
        skipped_count = 0

        for file_path in source_files:
            try:
                # Check if file needs reindexing (incremental mode)
                if self.incremental and not store.needs_reindex(file_path):
                    skipped_count += 1
                    continue

                # Read and parse file
                text = file_path.read_text(encoding="utf-8", errors="ignore")
                language_id = self.config.language_for_path(file_path)

@@ -491,13 +526,23 @@ class IndexTreeBuilder:

        store.close()

        self.logger.debug(
            "Built %s: %d files, %d symbols, %d subdirs",
            dir_path,
            files_count,
            symbols_count,
            len(subdirs),
        )
        if skipped_count > 0:
            self.logger.debug(
                "Built %s: %d files indexed, %d skipped (unchanged), %d symbols, %d subdirs",
                dir_path,
                files_count,
                skipped_count,
                symbols_count,
                len(subdirs),
            )
        else:
            self.logger.debug(
                "Built %s: %d files, %d symbols, %d subdirs",
                dir_path,
                files_count,
                symbols_count,
                len(subdirs),
            )

        return DirBuildResult(
            source_path=dir_path,
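Putting the pieces together, a hedged sketch of driving the builder directly. The RegistryStore constructor arguments are an assumption (its construction is not shown in this section), and the project path is illustrative:

from pathlib import Path

from codexlens.storage.index_tree import IndexTreeBuilder
from codexlens.storage.path_mapper import PathMapper
from codexlens.storage.registry import RegistryStore

registry = RegistryStore()   # constructor arguments assumed
registry.initialize()
builder = IndexTreeBuilder(registry, PathMapper(), incremental=True)

project = Path.home() / "projects" / "my_project"   # illustrative path

# Incremental run: unchanged files are skipped, deleted files are pruned.
result = builder.build(source_root=project, workers=4)

# Force a full rebuild regardless of modification times.
result = builder.build(source_root=project, force_full=True)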
@@ -0,0 +1,231 @@
"""
Migration 004: Add dual FTS tables for exact and fuzzy matching.

This migration introduces two FTS5 tables:
- files_fts_exact: Uses unicode61 tokenizer for exact token matching
- files_fts_fuzzy: Uses trigram tokenizer (or extended unicode61) for substring/fuzzy matching

Both tables are synchronized with the files table via triggers for automatic updates.
"""

import logging
from sqlite3 import Connection

from codexlens.storage.sqlite_utils import check_trigram_support, get_sqlite_version

log = logging.getLogger(__name__)


def upgrade(db_conn: Connection):
    """
    Applies the migration to add dual FTS tables.

    - Drops old files_fts table and triggers
    - Creates files_fts_exact with unicode61 tokenizer
    - Creates files_fts_fuzzy with trigram or extended unicode61 tokenizer
    - Creates synchronized triggers for both tables
    - Rebuilds FTS indexes from files table

    Args:
        db_conn: The SQLite database connection.
    """
    cursor = db_conn.cursor()

    try:
        # Check trigram support
        has_trigram = check_trigram_support(db_conn)
        version = get_sqlite_version(db_conn)
        log.info(f"SQLite version: {'.'.join(map(str, version))}")

        if has_trigram:
            log.info("Trigram tokenizer available, using for fuzzy FTS table")
            fuzzy_tokenizer = "trigram"
        else:
            log.warning(
                f"Trigram tokenizer not available (requires SQLite >= 3.34), "
                f"using extended unicode61 tokenizer for fuzzy matching"
            )
            fuzzy_tokenizer = "unicode61 tokenchars '_-'"

        # Start transaction
        cursor.execute("BEGIN TRANSACTION")

        # Check if files table has 'name' column (v2 schema doesn't have it)
        cursor.execute("PRAGMA table_info(files)")
        columns = {row[1] for row in cursor.fetchall()}

        if 'name' not in columns:
            log.info("Adding 'name' column to files table (v2 schema upgrade)...")
            # Add name column
            cursor.execute("ALTER TABLE files ADD COLUMN name TEXT")
            # Populate name from path (extract filename from last '/')
            # Use Python to do the extraction since SQLite doesn't have reverse()
            cursor.execute("SELECT rowid, path FROM files")
            rows = cursor.fetchall()
            for rowid, path in rows:
                # Extract filename from path
                name = path.split('/')[-1] if '/' in path else path
                cursor.execute("UPDATE files SET name = ? WHERE rowid = ?", (name, rowid))

        # Rename 'path' column to 'full_path' if needed
        if 'path' in columns and 'full_path' not in columns:
            log.info("Renaming 'path' to 'full_path' (v2 schema upgrade)...")
            # Check if indexed_at column exists in v2 schema
            has_indexed_at = 'indexed_at' in columns
            has_mtime = 'mtime' in columns

            # SQLite doesn't support RENAME COLUMN before 3.25, so use table recreation
            cursor.execute("""
                CREATE TABLE files_new (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    name TEXT NOT NULL,
                    full_path TEXT NOT NULL UNIQUE,
                    content TEXT,
                    language TEXT,
                    mtime REAL,
                    indexed_at TEXT
                )
            """)

            # Build INSERT statement based on available columns
            # Note: the v2 schema has no integer id column (path is the PRIMARY KEY),
            # so id is omitted below and AUTOINCREMENT assigns fresh ids
            if has_indexed_at and has_mtime:
                cursor.execute("""
                    INSERT INTO files_new (name, full_path, content, language, mtime, indexed_at)
                    SELECT name, path, content, language, mtime, indexed_at FROM files
                """)
            elif has_indexed_at:
                cursor.execute("""
                    INSERT INTO files_new (name, full_path, content, language, indexed_at)
                    SELECT name, path, content, language, indexed_at FROM files
                """)
            elif has_mtime:
                cursor.execute("""
                    INSERT INTO files_new (name, full_path, content, language, mtime)
                    SELECT name, path, content, language, mtime FROM files
                """)
            else:
                cursor.execute("""
                    INSERT INTO files_new (name, full_path, content, language)
                    SELECT name, path, content, language FROM files
                """)

            cursor.execute("DROP TABLE files")
            cursor.execute("ALTER TABLE files_new RENAME TO files")

        log.info("Dropping old FTS triggers and table...")
        # Drop old triggers
        cursor.execute("DROP TRIGGER IF EXISTS files_ai")
        cursor.execute("DROP TRIGGER IF EXISTS files_ad")
        cursor.execute("DROP TRIGGER IF EXISTS files_au")

        # Drop old FTS table
        cursor.execute("DROP TABLE IF EXISTS files_fts")

        # Create exact FTS table (unicode61 with underscores/hyphens as token chars)
        log.info("Creating files_fts_exact table with unicode61 tokenizer...")
        cursor.execute(
            """
            CREATE VIRTUAL TABLE files_fts_exact USING fts5(
                name, full_path UNINDEXED, content,
                content='files',
                content_rowid='id',
                tokenize="unicode61 tokenchars '_-'"
            )
            """
        )

        # Create fuzzy FTS table (trigram or extended unicode61)
        log.info(f"Creating files_fts_fuzzy table with {fuzzy_tokenizer} tokenizer...")
        cursor.execute(
            f"""
            CREATE VIRTUAL TABLE files_fts_fuzzy USING fts5(
                name, full_path UNINDEXED, content,
                content='files',
                content_rowid='id',
                tokenize="{fuzzy_tokenizer}"
            )
            """
        )

        # Create synchronized triggers for files_fts_exact
        log.info("Creating triggers for files_fts_exact...")
        cursor.execute(
            """
            CREATE TRIGGER files_exact_ai AFTER INSERT ON files BEGIN
                INSERT INTO files_fts_exact(rowid, name, full_path, content)
                VALUES(new.id, new.name, new.full_path, new.content);
            END
            """
        )
        cursor.execute(
            """
            CREATE TRIGGER files_exact_ad AFTER DELETE ON files BEGIN
                INSERT INTO files_fts_exact(files_fts_exact, rowid, name, full_path, content)
                VALUES('delete', old.id, old.name, old.full_path, old.content);
            END
            """
        )
        cursor.execute(
            """
            CREATE TRIGGER files_exact_au AFTER UPDATE ON files BEGIN
                INSERT INTO files_fts_exact(files_fts_exact, rowid, name, full_path, content)
                VALUES('delete', old.id, old.name, old.full_path, old.content);
                INSERT INTO files_fts_exact(rowid, name, full_path, content)
                VALUES(new.id, new.name, new.full_path, new.content);
            END
            """
        )

        # Create synchronized triggers for files_fts_fuzzy
        log.info("Creating triggers for files_fts_fuzzy...")
        cursor.execute(
            """
            CREATE TRIGGER files_fuzzy_ai AFTER INSERT ON files BEGIN
                INSERT INTO files_fts_fuzzy(rowid, name, full_path, content)
                VALUES(new.id, new.name, new.full_path, new.content);
            END
            """
        )
        cursor.execute(
            """
            CREATE TRIGGER files_fuzzy_ad AFTER DELETE ON files BEGIN
                INSERT INTO files_fts_fuzzy(files_fts_fuzzy, rowid, name, full_path, content)
                VALUES('delete', old.id, old.name, old.full_path, old.content);
            END
            """
        )
        cursor.execute(
            """
            CREATE TRIGGER files_fuzzy_au AFTER UPDATE ON files BEGIN
                INSERT INTO files_fts_fuzzy(files_fts_fuzzy, rowid, name, full_path, content)
                VALUES('delete', old.id, old.name, old.full_path, old.content);
                INSERT INTO files_fts_fuzzy(rowid, name, full_path, content)
                VALUES(new.id, new.name, new.full_path, new.content);
            END
            """
        )

        # Rebuild FTS indexes from files table
        log.info("Rebuilding FTS indexes from files table...")
        cursor.execute("INSERT INTO files_fts_exact(files_fts_exact) VALUES('rebuild')")
        cursor.execute("INSERT INTO files_fts_fuzzy(files_fts_fuzzy) VALUES('rebuild')")

        # Commit transaction
        cursor.execute("COMMIT")
        log.info("Migration 004 completed successfully")

        # Vacuum to reclaim space (outside transaction)
        try:
            log.info("Running VACUUM to reclaim space...")
            cursor.execute("VACUUM")
        except Exception as e:
            log.warning(f"VACUUM failed (non-critical): {e}")

    except Exception as e:
        log.error(f"Migration 004 failed: {e}")
        try:
            cursor.execute("ROLLBACK")
        except Exception:
            pass
        raise
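A minimal sketch of applying this migration to a single index database by hand. The module path used for the import is hypothetical (the migration's file name and the runner that normally invokes it are not shown in this diff):

import sqlite3

# Hypothetical import path for the migration module defined above.
from codexlens.storage.migrations import migration_004_dual_fts as migration_004

conn = sqlite3.connect("/path/to/index.db")   # illustrative path
migration_004.upgrade(conn)
conn.close()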

codex-lens/src/codexlens/storage/sqlite_utils.py (new file, 64 lines)
@@ -0,0 +1,64 @@
"""SQLite utility functions for CodexLens storage layer."""

from __future__ import annotations

import logging
import sqlite3

log = logging.getLogger(__name__)


def check_trigram_support(conn: sqlite3.Connection) -> bool:
    """Check if SQLite supports trigram tokenizer for FTS5.

    Trigram tokenizer requires SQLite >= 3.34.0.

    Args:
        conn: Database connection to test

    Returns:
        True if trigram tokenizer is available, False otherwise
    """
    try:
        # Test by creating a temporary virtual table with trigram tokenizer
        conn.execute(
            """
            CREATE VIRTUAL TABLE IF NOT EXISTS test_trigram_check
            USING fts5(test_content, tokenize='trigram')
            """
        )
        # Clean up test table
        conn.execute("DROP TABLE IF EXISTS test_trigram_check")
        conn.commit()
        return True
    except sqlite3.OperationalError as e:
        # Trigram tokenizer not available
        if "unrecognized tokenizer" in str(e).lower():
            log.debug("Trigram tokenizer not available in this SQLite version")
            return False
        # Other operational errors should be re-raised
        raise
    except Exception:
        # Any other exception means trigram is not supported
        return False


def get_sqlite_version(conn: sqlite3.Connection) -> tuple[int, int, int]:
    """Get SQLite version as (major, minor, patch) tuple.

    Args:
        conn: Database connection

    Returns:
        Version tuple, e.g., (3, 34, 1)
    """
    row = conn.execute("SELECT sqlite_version()").fetchone()
    version_str = row[0] if row else "0.0.0"
    parts = version_str.split('.')
    try:
        major = int(parts[0]) if len(parts) > 0 else 0
        minor = int(parts[1]) if len(parts) > 1 else 0
        patch = int(parts[2]) if len(parts) > 2 else 0
        return (major, minor, patch)
    except (ValueError, IndexError):
        return (0, 0, 0)
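Usage of these helpers mirrors what the schema code and the migration do above; a short sketch (the printed version depends on the SQLite your Python build links against):

import sqlite3

from codexlens.storage.sqlite_utils import check_trigram_support, get_sqlite_version

conn = sqlite3.connect(":memory:")
print(get_sqlite_version(conn))   # e.g. (3, 45, 1)

# Same fallback logic used when creating the fuzzy FTS table.
fuzzy_tokenizer = "trigram" if check_trigram_support(conn) else "unicode61 tokenchars '_-'"
print(fuzzy_tokenizer)
conn.close()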