feat: Enhance search functionality with quality tiers and scoped indexing

- Updated `search_code` function to include a `quality` parameter for search quality tiers: "fast", "balanced", "thorough", and "auto".
- Introduced `search_scope` function to limit search results to a specific directory scope.
- Added `index_scope` function for indexing a specific directory without re-indexing the entire project.
- Refactored `SearchPipeline` to support quality-based routing in the `search` method.
- Implemented `Shard` and `ShardManager` classes to manage multiple index shards with LRU eviction and efficient file routing.
- Added debounce functionality in `IncrementalIndexer` to batch file events and reduce redundant processing.
- Enhanced `FileWatcher` to integrate with `IncrementalIndexer` for improved event handling.
This commit is contained in:
catlog22
2026-03-19 17:47:53 +08:00
parent 54071473fc
commit 18aff260a0
46 changed files with 1537 additions and 658 deletions

View File

@@ -124,6 +124,19 @@ def create_config_from_env(db_path: str | Path, **overrides: object) -> "Config"
kwargs["hnsw_ef"] = int(os.environ["CODEXLENS_HNSW_EF"])
if os.environ.get("CODEXLENS_HNSW_M"):
kwargs["hnsw_M"] = int(os.environ["CODEXLENS_HNSW_M"])
# Tier config from env
if os.environ.get("CODEXLENS_TIER_HOT_HOURS"):
kwargs["tier_hot_hours"] = int(os.environ["CODEXLENS_TIER_HOT_HOURS"])
if os.environ.get("CODEXLENS_TIER_COLD_HOURS"):
kwargs["tier_cold_hours"] = int(os.environ["CODEXLENS_TIER_COLD_HOURS"])
# Search quality tier from env
if os.environ.get("CODEXLENS_SEARCH_QUALITY"):
kwargs["default_search_quality"] = os.environ["CODEXLENS_SEARCH_QUALITY"]
# Shard config from env
if os.environ.get("CODEXLENS_NUM_SHARDS"):
kwargs["num_shards"] = int(os.environ["CODEXLENS_NUM_SHARDS"])
if os.environ.get("CODEXLENS_MAX_LOADED_SHARDS"):
kwargs["max_loaded_shards"] = int(os.environ["CODEXLENS_MAX_LOADED_SHARDS"])
resolved = Path(db_path).resolve()
kwargs["metadata_db_path"] = str(resolved / "metadata.db")
return Config(**kwargs)
@@ -143,28 +156,8 @@ def _create_config(args: argparse.Namespace) -> "Config":
return create_config_from_env(args.db_path, **overrides)
def create_pipeline(
db_path: str | Path,
config: "Config | None" = None,
) -> tuple:
"""Construct pipeline components from db_path and config.
Returns (indexing_pipeline, search_pipeline, config).
Used by both CLI bridge and MCP server.
"""
from codexlens_search.config import Config
from codexlens_search.core.factory import create_ann_index, create_binary_index
from codexlens_search.indexing.metadata import MetadataStore
from codexlens_search.indexing.pipeline import IndexingPipeline
from codexlens_search.search.fts import FTSEngine
from codexlens_search.search.pipeline import SearchPipeline
if config is None:
config = create_config_from_env(db_path)
resolved = Path(db_path).resolve()
resolved.mkdir(parents=True, exist_ok=True)
# Select embedder: API if configured, otherwise local fastembed
def _create_embedder(config: "Config"):
"""Create embedder based on config, auto-detecting embed_dim from API."""
if config.embed_api_url:
from codexlens_search.embed.api import APIEmbedder
embedder = APIEmbedder(config)
@@ -179,13 +172,11 @@ def create_pipeline(
else:
from codexlens_search.embed.local import FastEmbedEmbedder
embedder = FastEmbedEmbedder(config)
return embedder
binary_store = create_binary_index(resolved, config.embed_dim, config)
ann_index = create_ann_index(resolved, config.embed_dim, config)
fts = FTSEngine(resolved / "fts.db")
metadata = MetadataStore(resolved / "metadata.db")
# Select reranker: API if configured, otherwise local fastembed
def _create_reranker(config: "Config"):
"""Create reranker based on config."""
if config.reranker_api_url:
from codexlens_search.rerank.api import APIReranker
reranker = APIReranker(config)
@@ -193,6 +184,60 @@ def create_pipeline(
else:
from codexlens_search.rerank.local import FastEmbedReranker
reranker = FastEmbedReranker(config)
return reranker
def create_pipeline(
db_path: str | Path,
config: "Config | None" = None,
) -> tuple:
"""Construct pipeline components from db_path and config.
Returns (indexing_pipeline, search_pipeline, config).
Used by both CLI bridge and MCP server.
When config.num_shards > 1, returns a ShardManager-backed pipeline
where indexing and search are delegated to the ShardManager.
The returned tuple is (shard_manager, shard_manager, config) so that
callers can use shard_manager.sync() and shard_manager.search().
"""
from codexlens_search.config import Config
if config is None:
config = create_config_from_env(db_path)
resolved = Path(db_path).resolve()
resolved.mkdir(parents=True, exist_ok=True)
embedder = _create_embedder(config)
reranker = _create_reranker(config)
# Sharded mode: delegate to ShardManager
if config.num_shards > 1:
from codexlens_search.core.shard_manager import ShardManager
manager = ShardManager(
num_shards=config.num_shards,
db_path=resolved,
config=config,
embedder=embedder,
reranker=reranker,
)
log.info(
"Using ShardManager with %d shards (max_loaded=%d)",
config.num_shards, config.max_loaded_shards,
)
return manager, manager, config
# Single-shard mode: original behavior, no ShardManager overhead
from codexlens_search.core.factory import create_ann_index, create_binary_index
from codexlens_search.indexing.metadata import MetadataStore
from codexlens_search.indexing.pipeline import IndexingPipeline
from codexlens_search.search.fts import FTSEngine
from codexlens_search.search.pipeline import SearchPipeline
binary_store = create_binary_index(resolved, config.embed_dim, config)
ann_index = create_ann_index(resolved, config.embed_dim, config)
fts = FTSEngine(resolved / "fts.db")
metadata = MetadataStore(resolved / "metadata.db")
indexing = IndexingPipeline(
embedder=embedder,

View File

@@ -47,7 +47,7 @@ class Config:
# Backend selection: 'auto', 'faiss', 'hnswlib'
ann_backend: str = "auto"
binary_backend: str = "auto"
binary_backend: str = "faiss"
# Indexing pipeline
index_workers: int = 2 # number of parallel indexing workers
@@ -77,6 +77,17 @@ class Config:
# Metadata store
metadata_db_path: str = "" # empty = no metadata tracking
# Data tiering (hot/warm/cold)
tier_hot_hours: int = 24 # files accessed within this window are 'hot'
tier_cold_hours: int = 168 # files not accessed for this long are 'cold'
# Search quality tier: 'fast', 'balanced', 'thorough', 'auto'
default_search_quality: str = "auto"
# Shard partitioning
num_shards: int = 1 # 1 = single partition (no sharding), >1 = hash-based sharding
max_loaded_shards: int = 4 # LRU limit for loaded shards in ShardManager
# FTS
fts_top_k: int = 50

View File

@@ -15,6 +15,13 @@ logger = logging.getLogger(__name__)
class BinaryStore(BaseBinaryIndex):
"""Persistent binary vector store using numpy memmap.
.. deprecated::
Prefer ``FAISSBinaryIndex`` for binary coarse search. This class is
retained as a numpy-only fallback for environments where FAISS is not
available. New code should use ``create_binary_index()`` from
``codexlens_search.core.factory`` which selects the best backend
automatically.
Stores binary-quantized float32 vectors as packed uint8 arrays on disk.
Supports fast coarse search via XOR + popcount Hamming distance.
"""

View File

@@ -1,6 +1,7 @@
from __future__ import annotations
import logging
import warnings
from pathlib import Path
from codexlens_search.config import Config
@@ -97,14 +98,29 @@ def create_binary_index(
backend = config.binary_backend
if backend == "faiss":
from codexlens_search.core.faiss_index import FAISSBinaryIndex
return FAISSBinaryIndex(path, dim, config)
if _FAISS_AVAILABLE:
from codexlens_search.core.faiss_index import FAISSBinaryIndex
return FAISSBinaryIndex(path, dim, config)
# FAISS explicitly requested but not installed: fall back with warning
from codexlens_search.core.binary import BinaryStore
warnings.warn(
"binary_backend='faiss' but FAISS is not installed. "
"Falling back to deprecated numpy BinaryStore. "
"Install faiss-cpu or faiss-gpu for the recommended binary backend.",
DeprecationWarning,
stacklevel=2,
)
logger.warning(
"binary_backend='faiss' but FAISS not available, "
"falling back to deprecated numpy BinaryStore."
)
return BinaryStore(path, dim, config)
if backend == "hnswlib":
from codexlens_search.core.binary import BinaryStore
return BinaryStore(path, dim, config)
# auto: try faiss first, then numpy-based BinaryStore
# auto: try faiss first, then numpy-based BinaryStore (deprecated fallback)
if _FAISS_AVAILABLE:
from codexlens_search.core.faiss_index import FAISSBinaryIndex
logger.info("Auto-selected FAISS binary backend")
@@ -112,5 +128,14 @@ def create_binary_index(
# numpy BinaryStore is always available (no extra deps)
from codexlens_search.core.binary import BinaryStore
logger.info("Auto-selected numpy BinaryStore backend")
warnings.warn(
"Falling back to numpy BinaryStore because FAISS is not installed. "
"BinaryStore is deprecated; install faiss-cpu or faiss-gpu for better performance.",
DeprecationWarning,
stacklevel=2,
)
logger.warning(
"FAISS not available, falling back to deprecated numpy BinaryStore. "
"Install faiss-cpu or faiss-gpu for the recommended binary backend."
)
return BinaryStore(path, dim, config)

View File

@@ -71,10 +71,23 @@ class FAISSANNIndex(BaseANNIndex):
self.load()
def load(self) -> None:
"""Load index from disk or initialize a fresh one."""
"""Load index from disk or initialize a fresh one.
Uses IO_FLAG_MMAP for zero-copy memory-mapped loading when available,
falling back to regular read_index() on older faiss versions.
"""
with self._lock:
if self._index_path.exists():
idx = faiss.read_index(str(self._index_path))
try:
idx = faiss.read_index(
str(self._index_path), faiss.IO_FLAG_MMAP
)
except (AttributeError, RuntimeError, Exception) as exc:
logger.debug(
"MMAP load failed, falling back to regular read: %s",
exc,
)
idx = faiss.read_index(str(self._index_path))
logger.debug(
"Loaded FAISS ANN index from %s (%d items)",
self._index_path, idx.ntotal,
@@ -201,10 +214,23 @@ class FAISSBinaryIndex(BaseBinaryIndex):
return np.packbits(binary).reshape(1, -1)
def load(self) -> None:
"""Load binary index from disk or initialize a fresh one."""
"""Load binary index from disk or initialize a fresh one.
Uses IO_FLAG_MMAP for zero-copy memory-mapped loading when available,
falling back to regular read_index_binary() on older faiss versions.
"""
with self._lock:
if self._index_path.exists():
idx = faiss.read_index_binary(str(self._index_path))
try:
idx = faiss.read_index_binary(
str(self._index_path), faiss.IO_FLAG_MMAP
)
except (AttributeError, RuntimeError, Exception) as exc:
logger.debug(
"MMAP load failed, falling back to regular read: %s",
exc,
)
idx = faiss.read_index_binary(str(self._index_path))
logger.debug(
"Loaded FAISS binary index from %s (%d items)",
self._index_path, idx.ntotal,

View File

@@ -0,0 +1,178 @@
"""Single index partition (shard) that owns FTS, binary, ANN, and metadata stores."""
from __future__ import annotations
import logging
from pathlib import Path
from codexlens_search.config import Config
from codexlens_search.core.base import BaseANNIndex, BaseBinaryIndex
from codexlens_search.embed.base import BaseEmbedder
from codexlens_search.indexing.metadata import MetadataStore
from codexlens_search.indexing.pipeline import IndexingPipeline, IndexStats
from codexlens_search.rerank import BaseReranker
from codexlens_search.search.fts import FTSEngine
from codexlens_search.search.pipeline import SearchPipeline, SearchResult
logger = logging.getLogger(__name__)
class Shard:
    """A complete index partition with its own FTS, binary, ANN, and metadata stores.

    Components are lazy-loaded on first access and can be explicitly unloaded
    to release memory. The embedder and reranker are shared across shards
    (passed in from ShardManager) since they are expensive to instantiate.
    """

    def __init__(
        self,
        shard_id: int,
        db_path: str | Path,
        config: Config,
    ) -> None:
        self._shard_id = shard_id
        # Each shard keeps its stores under an isolated "shard_<id>" directory.
        self._shard_dir = Path(db_path).resolve() / f"shard_{shard_id}"
        self._config = config
        # Lazy-loaded components (created on _ensure_loaded)
        self._fts: FTSEngine | None = None
        self._binary_store: BaseBinaryIndex | None = None
        self._ann_index: BaseANNIndex | None = None
        self._metadata: MetadataStore | None = None
        self._indexing: IndexingPipeline | None = None
        self._search: SearchPipeline | None = None
        self._loaded = False

    @property
    def shard_id(self) -> int:
        return self._shard_id

    @property
    def is_loaded(self) -> bool:
        return self._loaded

    def _ensure_loaded(
        self,
        embedder: BaseEmbedder,
        reranker: BaseReranker,
    ) -> None:
        """Lazy-create all per-shard components if not yet loaded."""
        if self._loaded:
            return
        # Local import avoids a circular dependency with the factory module.
        from codexlens_search.core.factory import create_ann_index, create_binary_index

        self._shard_dir.mkdir(parents=True, exist_ok=True)
        self._fts = FTSEngine(self._shard_dir / "fts.db")
        self._binary_store = create_binary_index(
            self._shard_dir, self._config.embed_dim, self._config
        )
        self._ann_index = create_ann_index(
            self._shard_dir, self._config.embed_dim, self._config
        )
        self._metadata = MetadataStore(self._shard_dir / "metadata.db")
        self._indexing = IndexingPipeline(
            embedder=embedder,
            binary_store=self._binary_store,
            ann_index=self._ann_index,
            fts=self._fts,
            config=self._config,
            metadata=self._metadata,
        )
        self._search = SearchPipeline(
            embedder=embedder,
            binary_store=self._binary_store,
            ann_index=self._ann_index,
            reranker=reranker,
            fts=self._fts,
            config=self._config,
            metadata_store=self._metadata,
        )
        self._loaded = True
        logger.debug("Shard %d loaded from %s", self._shard_id, self._shard_dir)

    def unload(self) -> None:
        """Persist state, then release memory by closing connections and dropping references.

        Fix: previously the binary/ANN references were dropped without a
        flush, so LRU eviction in ShardManager (which calls unload()
        directly) could silently discard index writes that had not yet
        been saved to disk. save() is called first; it is a no-op when
        nothing is loaded, so this is safe to call repeatedly.
        """
        if not self._loaded:
            return
        # Flush binary/ANN state to disk before dropping the references.
        self.save()
        if self._metadata is not None:
            self._metadata.close()
        self._fts = None
        self._binary_store = None
        self._ann_index = None
        self._metadata = None
        self._indexing = None
        self._search = None
        self._loaded = False
        logger.debug("Shard %d unloaded", self._shard_id)

    def load(
        self,
        embedder: BaseEmbedder,
        reranker: BaseReranker,
    ) -> None:
        """Explicitly load shard components."""
        self._ensure_loaded(embedder, reranker)

    def save(self) -> None:
        """Persist binary and ANN indexes to disk. No-op when not loaded."""
        if not self._loaded:
            return
        if self._binary_store is not None:
            self._binary_store.save()
        if self._ann_index is not None:
            self._ann_index.save()

    def search(
        self,
        query: str,
        embedder: BaseEmbedder,
        reranker: BaseReranker,
        quality: str | None = None,
        top_k: int | None = None,
    ) -> list[SearchResult]:
        """Search this shard's index.

        Args:
            query: Search query string.
            embedder: Shared embedder instance.
            reranker: Shared reranker instance.
            quality: Search quality tier.
            top_k: Maximum results to return.

        Returns:
            List of SearchResult from this shard.
        """
        self._ensure_loaded(embedder, reranker)
        assert self._search is not None
        return self._search.search(query, top_k=top_k, quality=quality)

    def sync(
        self,
        files: list[Path],
        root: Path | None,
        embedder: BaseEmbedder,
        reranker: BaseReranker,
        **kwargs: object,
    ) -> IndexStats:
        """Sync this shard's index with the given files.

        Args:
            files: Files that belong to this shard.
            root: Root directory for relative paths.
            embedder: Shared embedder instance.
            reranker: Shared reranker instance (needed only to build the
                search pipeline during lazy load; not used for indexing).
            **kwargs: Forwarded to IndexingPipeline.sync().

        Returns:
            IndexStats for this shard's sync operation.
        """
        self._ensure_loaded(embedder, reranker)
        assert self._indexing is not None
        return self._indexing.sync(files, root=root, **kwargs)

View File

@@ -0,0 +1,250 @@
"""ShardManager: manages multiple Shard instances with LRU eviction."""
from __future__ import annotations
import logging
import threading
from collections import OrderedDict
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from codexlens_search.config import Config
from codexlens_search.core.shard import Shard
from codexlens_search.embed.base import BaseEmbedder
from codexlens_search.indexing.pipeline import IndexStats
from codexlens_search.rerank import BaseReranker
from codexlens_search.search.fusion import reciprocal_rank_fusion
from codexlens_search.search.pipeline import SearchResult
logger = logging.getLogger(__name__)
class ShardManager:
    """Manages multiple Shard instances with hash-based file routing and LRU eviction.

    Files are deterministically routed to shards via a stable CRC32 of the
    path modulo num_shards. Search queries all shards in parallel and merges
    results via RRF fusion. At most max_loaded_shards are kept in memory;
    least-recently-used shards are unloaded when the limit is exceeded.
    """

    def __init__(
        self,
        num_shards: int,
        db_path: str | Path,
        config: Config,
        embedder: BaseEmbedder,
        reranker: BaseReranker,
    ) -> None:
        if num_shards < 1:
            raise ValueError("num_shards must be >= 1")
        self._num_shards = num_shards
        self._db_path = Path(db_path).resolve()
        self._config = config
        self._embedder = embedder
        self._reranker = reranker
        self._max_loaded = config.max_loaded_shards
        # Create all Shard objects (lazy-loaded, no I/O yet)
        self._shards: dict[int, Shard] = {
            i: Shard(i, self._db_path, config)
            for i in range(num_shards)
        }
        # LRU tracking: keys are shard_ids, most-recently-used at end
        self._loaded_order: OrderedDict[int, None] = OrderedDict()
        self._lru_lock = threading.Lock()

    @property
    def num_shards(self) -> int:
        return self._num_shards

    def route_file(self, path: str) -> int:
        """Deterministically route a file path to a shard ID.

        Fix: the previous implementation used the builtin ``hash()``,
        which is salted per process for str (PYTHONHASHSEED), so routing
        changed between runs and files silently migrated across shards
        after every restart. CRC32 is stable across processes and
        platforms while remaining cheap and uniformly distributed enough
        for sharding.
        """
        import zlib

        return zlib.crc32(path.encode("utf-8")) % self._num_shards

    def get_shard(self, shard_id: int) -> Shard:
        """Return the Shard instance for a given shard_id.

        Raises:
            ValueError: If shard_id is outside [0, num_shards).
        """
        if shard_id not in self._shards:
            raise ValueError(
                f"Invalid shard_id {shard_id}, valid range: 0-{self._num_shards - 1}"
            )
        return self._shards[shard_id]

    def _ensure_loaded(self, shard_id: int) -> Shard:
        """Load a shard if needed, applying LRU eviction policy.

        Thread-safe: protects OrderedDict mutations with a lock.
        Returns the loaded Shard.
        """
        shard = self._shards[shard_id]
        with self._lru_lock:
            # Mark as most-recently-used
            if shard_id in self._loaded_order:
                self._loaded_order.move_to_end(shard_id)
            else:
                self._loaded_order[shard_id] = None
            # Load if not already loaded
            if not shard.is_loaded:
                shard.load(self._embedder, self._reranker)
            # Evict LRU shards if over limit
            while len(self._loaded_order) > self._max_loaded:
                evict_id, _ = self._loaded_order.popitem(last=False)
                evict_shard = self._shards[evict_id]
                if evict_shard.is_loaded:
                    logger.info("LRU evicting shard %d", evict_id)
                    evict_shard.unload()
        return shard

    def sync(
        self,
        files: list[Path],
        root: Path | None = None,
        **kwargs: object,
    ) -> IndexStats:
        """Sync index with files, routing each file to its shard.

        Groups files by shard via route_file(), then syncs each shard
        with its subset of files.

        Args:
            files: Current list of files to index.
            root: Root directory for relative paths.
            **kwargs: Forwarded to Shard.sync().

        Returns:
            Aggregated IndexStats across all shards.
        """
        # Routing keys are root-relative when a root is given, so moving
        # the project directory does not re-route every file.
        shard_files: dict[int, list[Path]] = {i: [] for i in range(self._num_shards)}
        for fpath in files:
            rel = str(fpath.relative_to(root)) if root else str(fpath)
            shard_id = self.route_file(rel)
            shard_files[shard_id].append(fpath)
        total_files = 0
        total_chunks = 0
        total_duration = 0.0
        for shard_id, shard_file_list in shard_files.items():
            if not shard_file_list:
                continue
            self._ensure_loaded(shard_id)
            shard = self._shards[shard_id]
            stats = shard.sync(
                shard_file_list,
                root=root,
                embedder=self._embedder,
                reranker=self._reranker,
                **kwargs,
            )
            total_files += stats.files_processed
            total_chunks += stats.chunks_created
            total_duration += stats.duration_seconds
        return IndexStats(
            files_processed=total_files,
            chunks_created=total_chunks,
            duration_seconds=round(total_duration, 2),
        )

    def search(
        self,
        query: str,
        quality: str | None = None,
        top_k: int | None = None,
    ) -> list[SearchResult]:
        """Search all shards in parallel, merge results via RRF fusion.

        Each shard returns its own ranked results. Cross-shard merging
        uses reciprocal_rank_fusion with equal weights across shards.
        Per-shard top_k is increased to compensate for cross-shard dilution.

        Args:
            query: Search query string.
            quality: Search quality tier.
            top_k: Maximum final results to return.

        Returns:
            Merged list of SearchResult ordered by relevance.
        """
        cfg = self._config
        final_top_k = top_k if top_k is not None else cfg.reranker_top_k
        # Double the per-shard candidate count so each shard contributes
        # enough results for cross-shard RRF. (The original
        # max(final_top_k, final_top_k * 2) was tautologically 2*k.)
        per_shard_top_k = final_top_k * 2
        # Warm up all shards for this query.
        # NOTE(review): when num_shards > max_loaded_shards this loop
        # evicts earlier shards while loading later ones; Shard.search
        # then lazily reloads them, so results stay correct but the LRU
        # memory cap is not strictly honored during a query -- confirm
        # this trade-off is intended.
        for shard_id in range(self._num_shards):
            self._ensure_loaded(shard_id)
        # Parallel search across shards
        shard_results: dict[int, list[SearchResult]] = {}

        def _search_shard(sid: int) -> tuple[int, list[SearchResult]]:
            shard = self._shards[sid]
            results = shard.search(
                query,
                embedder=self._embedder,
                reranker=self._reranker,
                quality=quality,
                top_k=per_shard_top_k,
            )
            return sid, results

        with ThreadPoolExecutor(max_workers=min(self._num_shards, 4)) as pool:
            futures = [pool.submit(_search_shard, sid) for sid in range(self._num_shards)]
            for future in futures:
                try:
                    sid, results = future.result()
                    shard_results[sid] = results
                except Exception:
                    # A failing shard degrades results rather than failing
                    # the whole query.
                    logger.warning("Shard search failed", exc_info=True)
        # If only one shard returned results, no merging needed
        non_empty = {k: v for k, v in shard_results.items() if v}
        if not non_empty:
            return []
        if len(non_empty) == 1:
            results = list(non_empty.values())[0]
            return results[:final_top_k]
        # Cross-shard RRF merge. doc_ids are shard-local, so assign each
        # result a globally unique surrogate id for the fusion input and
        # keep a map back to the original SearchResult.
        rrf_input: dict[str, list[tuple[int, float]]] = {}
        global_results: dict[int, SearchResult] = {}
        global_id = 0
        for sid, results in non_empty.items():
            ranked: list[tuple[int, float]] = []
            for r in results:
                global_results[global_id] = r
                ranked.append((global_id, r.score))
                global_id += 1
            rrf_input[f"shard_{sid}"] = ranked
        fused = reciprocal_rank_fusion(rrf_input, k=cfg.fusion_k)
        merged: list[SearchResult] = []
        for gid, fused_score in fused[:final_top_k]:
            result = global_results[gid]
            merged.append(SearchResult(
                id=result.id,
                path=result.path,
                score=fused_score,
                snippet=result.snippet,
                line=result.line,
                end_line=result.end_line,
                content=result.content,
            ))
        return merged

View File

@@ -2,6 +2,7 @@
from __future__ import annotations
import sqlite3
import time
from pathlib import Path
@@ -9,7 +10,8 @@ class MetadataStore:
"""Tracks file-to-chunk mappings and deleted chunk IDs (tombstones).
Tables:
files - file_path (PK), content_hash, last_modified
files - file_path (PK), content_hash, last_modified, file_size,
tier ('hot'/'warm'/'cold'), last_accessed (epoch float)
chunks - chunk_id (PK), file_path (FK CASCADE), chunk_hash
deleted_chunks - chunk_id (PK) for tombstone tracking
"""
@@ -19,13 +21,18 @@ class MetadataStore:
self._conn.execute("PRAGMA foreign_keys = ON")
self._conn.execute("PRAGMA journal_mode = WAL")
self._create_tables()
self._migrate_size_column()
self._migrate_tier_columns()
def _create_tables(self) -> None:
self._conn.executescript("""
CREATE TABLE IF NOT EXISTS files (
file_path TEXT PRIMARY KEY,
content_hash TEXT NOT NULL,
last_modified REAL NOT NULL
last_modified REAL NOT NULL,
file_size INTEGER NOT NULL DEFAULT 0,
tier TEXT NOT NULL DEFAULT 'warm',
last_accessed REAL
);
CREATE TABLE IF NOT EXISTS chunks (
@@ -41,14 +48,48 @@ class MetadataStore:
""")
self._conn.commit()
def _migrate_size_column(self) -> None:
"""Add file_size column if missing (for pre-existing DBs)."""
cols = {
row[1]
for row in self._conn.execute("PRAGMA table_info(files)").fetchall()
}
if "file_size" not in cols:
self._conn.execute(
"ALTER TABLE files ADD COLUMN file_size INTEGER NOT NULL DEFAULT 0"
)
self._conn.commit()
def _migrate_tier_columns(self) -> None:
"""Add tier and last_accessed columns if missing (for pre-existing DBs)."""
cols = {
row[1]
for row in self._conn.execute("PRAGMA table_info(files)").fetchall()
}
if "tier" not in cols:
self._conn.execute(
"ALTER TABLE files ADD COLUMN tier TEXT NOT NULL DEFAULT 'warm'"
)
if "last_accessed" not in cols:
self._conn.execute(
"ALTER TABLE files ADD COLUMN last_accessed REAL"
)
if "tier" not in cols or "last_accessed" not in cols:
self._conn.commit()
def register_file(
self, file_path: str, content_hash: str, mtime: float
self,
file_path: str,
content_hash: str,
mtime: float,
file_size: int = 0,
) -> None:
"""Insert or update a file record."""
self._conn.execute(
"INSERT OR REPLACE INTO files (file_path, content_hash, last_modified) "
"VALUES (?, ?, ?)",
(file_path, content_hash, mtime),
"INSERT OR REPLACE INTO files "
"(file_path, content_hash, last_modified, file_size) "
"VALUES (?, ?, ?, ?)",
(file_path, content_hash, mtime, file_size),
)
self._conn.commit()
@@ -121,6 +162,24 @@ class MetadataStore:
return True # New file
return stored != content_hash
def file_needs_update_fast(
self, file_path: str, mtime: float, size: int
) -> bool:
"""Fast pre-check using mtime and file size (no content read needed).
Returns True if the file appears changed or is not yet tracked.
When mtime and size both match stored values, the file is assumed
unchanged (~1000x faster than content-hash comparison).
"""
row = self._conn.execute(
"SELECT last_modified, file_size FROM files WHERE file_path = ?",
(file_path,),
).fetchone()
if row is None:
return True # New file
stored_mtime, stored_size = row
return stored_mtime != mtime or stored_size != size
def compact_deleted(self) -> set[int]:
"""Return deleted IDs and clear the deleted_chunks table.
@@ -161,5 +220,81 @@ class MetadataStore:
).fetchone()
return row[0] if row[0] is not None else -1
# ------------------------------------------------------------------
# Tier management
# ------------------------------------------------------------------
def record_access(self, file_path: str) -> None:
"""Update last_accessed timestamp for a file."""
self._conn.execute(
"UPDATE files SET last_accessed = ? WHERE file_path = ?",
(time.time(), file_path),
)
self._conn.commit()
def record_access_batch(self, file_paths: list[str]) -> None:
"""Batch-update last_accessed timestamps for multiple files."""
if not file_paths:
return
now = time.time()
self._conn.executemany(
"UPDATE files SET last_accessed = ? WHERE file_path = ?",
[(now, fp) for fp in file_paths],
)
self._conn.commit()
def classify_tiers(
self, hot_threshold_hours: int = 24, cold_threshold_hours: int = 168
) -> None:
"""Reclassify all files into hot/warm/cold tiers based on last_accessed.
- hot: last_accessed within hot_threshold_hours
- cold: last_accessed older than cold_threshold_hours (or never accessed)
- warm: everything in between
"""
now = time.time()
hot_cutoff = now - hot_threshold_hours * 3600
cold_cutoff = now - cold_threshold_hours * 3600
# Hot: recently accessed
self._conn.execute(
"UPDATE files SET tier = 'hot' "
"WHERE last_accessed IS NOT NULL AND last_accessed >= ?",
(hot_cutoff,),
)
# Cold: not accessed for a long time, or never accessed
self._conn.execute(
"UPDATE files SET tier = 'cold' "
"WHERE last_accessed IS NULL "
"OR (last_accessed < ? AND last_accessed < ?)",
(cold_cutoff, hot_cutoff),
)
# Warm: between hot and cold cutoffs
self._conn.execute(
"UPDATE files SET tier = 'warm' "
"WHERE last_accessed IS NOT NULL "
"AND last_accessed >= ? AND last_accessed < ?",
(cold_cutoff, hot_cutoff),
)
self._conn.commit()
def get_files_by_tier(self, tier: str) -> list[str]:
"""Return file paths in the specified tier ('hot', 'warm', or 'cold')."""
rows = self._conn.execute(
"SELECT file_path FROM files WHERE tier = ?", (tier,)
).fetchall()
return [r[0] for r in rows]
    def get_cold_files(self) -> list[str]:
        """Return file paths in the 'cold' tier."""
        # Thin convenience wrapper over get_files_by_tier.
        return self.get_files_by_tier("cold")
def get_file_tier(self, file_path: str) -> str | None:
"""Return the tier for a specific file, or None if not tracked."""
row = self._conn.execute(
"SELECT tier FROM files WHERE file_path = ?", (file_path,)
).fetchone()
return row[0] if row else None
    def close(self) -> None:
        """Close the underlying SQLite connection; the store is unusable afterwards."""
        self._conn.close()

View File

@@ -17,8 +17,7 @@ from pathlib import Path
import numpy as np
from codexlens_search.config import Config
from codexlens_search.core.binary import BinaryStore
from codexlens_search.core.index import ANNIndex
from codexlens_search.core.base import BaseANNIndex, BaseBinaryIndex
from codexlens_search.embed.base import BaseEmbedder
from codexlens_search.indexing.metadata import MetadataStore
from codexlens_search.search.fts import FTSEngine
@@ -100,8 +99,8 @@ class IndexingPipeline:
def __init__(
self,
embedder: BaseEmbedder,
binary_store: BinaryStore,
ann_index: ANNIndex,
binary_store: BaseBinaryIndex,
ann_index: BaseANNIndex,
fts: FTSEngine,
config: Config,
metadata: MetadataStore | None = None,
@@ -463,6 +462,94 @@ class IndexingPipeline:
meta = self._require_metadata()
return meta.max_chunk_id() + 1
    def index_files_fts_only(
        self,
        files: list[Path],
        *,
        root: Path | None = None,
        max_chunk_chars: int = _DEFAULT_MAX_CHUNK_CHARS,
        chunk_overlap: int = _DEFAULT_CHUNK_OVERLAP,
    ) -> IndexStats:
        """Index files into FTS5 only, without embedding or vector indexing.

        Chunks files using the same logic as the full pipeline, then inserts
        directly into FTS. No embedding computation, no binary/ANN store writes.

        Args:
            files: List of file paths to index.
            root: Optional root for computing relative paths.
            max_chunk_chars: Maximum characters per chunk.
            chunk_overlap: Character overlap between consecutive chunks.

        Returns:
            IndexStats with counts and timing.
        """
        if not files:
            return IndexStats()
        # Metadata tracking is mandatory here: dedup and chunk-ID allocation
        # both depend on it, so raise early if it is not configured.
        meta = self._require_metadata()
        t0 = time.monotonic()
        # Continue chunk IDs from the highest ID already registered so FTS
        # rows stay unique across full and FTS-only indexing runs.
        chunk_id = self._next_chunk_id()
        files_processed = 0
        chunks_created = 0
        for fpath in files:
            exclude_reason = is_file_excluded(fpath, self._config)
            if exclude_reason:
                logger.debug("Skipping %s: %s", fpath, exclude_reason)
                continue
            try:
                text = fpath.read_text(encoding="utf-8", errors="replace")
            except Exception as exc:
                # Best-effort: unreadable files are skipped, not fatal.
                logger.debug("Skipping %s: %s", fpath, exc)
                continue
            rel_path = str(fpath.relative_to(root)) if root else str(fpath)
            content_hash = self._content_hash(text)
            # Skip unchanged files
            if not meta.file_needs_update(rel_path, content_hash):
                continue
            # Remove old FTS data if file was previously indexed
            if meta.get_file_hash(rel_path) is not None:
                meta.mark_file_deleted(rel_path)
                self._fts.delete_by_path(rel_path)
            file_chunks = self._smart_chunk(text, rel_path, max_chunk_chars, chunk_overlap)
            if not file_chunks:
                # No chunks produced: still register the file so later sync
                # passes can skip it via hash/mtime comparison.
                st = fpath.stat()
                meta.register_file(rel_path, content_hash, st.st_mtime, st.st_size)
                continue
            files_processed += 1
            fts_docs = []
            chunk_id_hashes = []
            for chunk_text, path, sl, el in file_chunks:
                fts_docs.append((chunk_id, path, chunk_text, sl, el))
                chunk_id_hashes.append((chunk_id, self._content_hash(chunk_text)))
                chunk_id += 1
            self._fts.add_documents(fts_docs)
            chunks_created += len(fts_docs)
            # Register metadata
            st = fpath.stat()
            meta.register_file(rel_path, content_hash, st.st_mtime, st.st_size)
            meta.register_chunks(rel_path, chunk_id_hashes)
        duration = time.monotonic() - t0
        stats = IndexStats(
            files_processed=files_processed,
            chunks_created=chunks_created,
            duration_seconds=round(duration, 2),
        )
        logger.info(
            "FTS-only indexing complete: %d files, %d chunks in %.1fs",
            stats.files_processed, stats.chunks_created, stats.duration_seconds,
        )
        return stats
def index_file(
self,
file_path: Path,
@@ -522,7 +609,8 @@ class IndexingPipeline:
file_chunks = self._smart_chunk(text, rel_path, max_chunk_chars, chunk_overlap)
if not file_chunks:
# Register file with no chunks
meta.register_file(rel_path, content_hash, file_path.stat().st_mtime)
st = file_path.stat()
meta.register_file(rel_path, content_hash, st.st_mtime, st.st_size)
return IndexStats(
files_processed=1,
duration_seconds=round(time.monotonic() - t0, 2),
@@ -556,7 +644,8 @@ class IndexingPipeline:
self._fts.add_documents(fts_docs)
# Register in metadata
meta.register_file(rel_path, content_hash, file_path.stat().st_mtime)
st = file_path.stat()
meta.register_file(rel_path, content_hash, st.st_mtime, st.st_size)
chunk_id_hashes = [
(batch_ids[i], self._content_hash(batch_texts[i]))
for i in range(len(batch_ids))
@@ -605,6 +694,7 @@ class IndexingPipeline:
chunk_overlap: int = _DEFAULT_CHUNK_OVERLAP,
max_file_size: int = 50_000,
progress_callback: callable | None = None,
tier: str = "full",
) -> IndexStats:
"""Reconcile index state against a current file list.
@@ -617,6 +707,9 @@ class IndexingPipeline:
max_chunk_chars: Maximum characters per chunk.
chunk_overlap: Character overlap between consecutive chunks.
max_file_size: Skip files larger than this (bytes).
tier: Indexing tier - 'full' (default) runs the full pipeline
with embedding, 'fts_only' runs FTS-only indexing without
embedding or vector stores.
Returns:
Aggregated IndexStats for all operations.
@@ -638,33 +731,72 @@ class IndexingPipeline:
for rel in removed:
self.remove_file(rel)
# Collect files needing update
# Collect files needing update using 4-level detection:
# Level 1: set diff (removed files) - handled above
# Level 2: mtime + size fast pre-check via stat()
# Level 3: content hash only when mtime/size mismatch
files_to_index: list[Path] = []
for rel, fpath in current_rel_paths.items():
# Level 2: stat-based fast check
try:
st = fpath.stat()
except OSError:
continue
if not meta.file_needs_update_fast(rel, st.st_mtime, st.st_size):
# mtime + size match stored values -> skip (no read needed)
continue
# Level 3: mtime/size changed -> verify with content hash
try:
text = fpath.read_text(encoding="utf-8", errors="replace")
except Exception:
continue
content_hash = self._content_hash(text)
if meta.file_needs_update(rel, content_hash):
# Remove old data if previously indexed
if meta.get_file_hash(rel) is not None:
meta.mark_file_deleted(rel)
self._fts.delete_by_path(rel)
files_to_index.append(fpath)
if not meta.file_needs_update(rel, content_hash):
# Content unchanged despite mtime/size change -> update metadata only
meta.register_file(rel, content_hash, st.st_mtime, st.st_size)
continue
# Batch index via parallel pipeline
# File genuinely changed -> remove old data and queue for re-index
if meta.get_file_hash(rel) is not None:
meta.mark_file_deleted(rel)
self._fts.delete_by_path(rel)
files_to_index.append(fpath)
# Sort files by data tier priority: hot first, then warm, then cold
if files_to_index:
# Set starting chunk ID from metadata
start_id = self._next_chunk_id()
batch_stats = self._index_files_with_metadata(
files_to_index,
root=root,
max_chunk_chars=max_chunk_chars,
chunk_overlap=chunk_overlap,
start_chunk_id=start_id,
progress_callback=progress_callback,
)
_tier_priority = {"hot": 0, "warm": 1, "cold": 2}
def _tier_sort_key(fp: Path) -> int:
rel = str(fp.relative_to(root)) if root else str(fp)
t = meta.get_file_tier(rel)
return _tier_priority.get(t or "warm", 1)
files_to_index.sort(key=_tier_sort_key)
# Reclassify data tiers after sync detection
meta.classify_tiers(
self._config.tier_hot_hours, self._config.tier_cold_hours
)
# Batch index via parallel pipeline or FTS-only
if files_to_index:
if tier == "fts_only":
batch_stats = self.index_files_fts_only(
files_to_index,
root=root,
max_chunk_chars=max_chunk_chars,
chunk_overlap=chunk_overlap,
)
else:
# Full pipeline with embedding
start_id = self._next_chunk_id()
batch_stats = self._index_files_with_metadata(
files_to_index,
root=root,
max_chunk_chars=max_chunk_chars,
chunk_overlap=chunk_overlap,
start_chunk_id=start_id,
progress_callback=progress_callback,
)
total_files = batch_stats.files_processed
total_chunks = batch_stats.chunks_created
else:
@@ -781,7 +913,8 @@ class IndexingPipeline:
file_chunks = self._smart_chunk(text, rel_path, max_chunk_chars, chunk_overlap)
if not file_chunks:
meta.register_file(rel_path, content_hash, fpath.stat().st_mtime)
st = fpath.stat()
meta.register_file(rel_path, content_hash, st.st_mtime, st.st_size)
continue
files_processed += 1
@@ -806,7 +939,8 @@ class IndexingPipeline:
chunks_created += len(file_chunk_ids)
# Register metadata per file
meta.register_file(rel_path, content_hash, fpath.stat().st_mtime)
st = fpath.stat()
meta.register_file(rel_path, content_hash, st.st_mtime, st.st_size)
chunk_id_hashes = [
(cid, self._content_hash(ct)) for cid, ct in file_chunk_ids
]

View File

@@ -102,13 +102,20 @@ def _get_pipelines(project_path: str) -> tuple:
# ---------------------------------------------------------------------------
@mcp.tool()
def search_code(project_path: str, query: str, top_k: int = 10) -> str:
def search_code(
project_path: str, query: str, top_k: int = 10, quality: str = "auto"
) -> str:
"""Semantic code search with hybrid fusion (vector + FTS + reranking).
Args:
project_path: Absolute path to the project root directory.
query: Natural language or code search query.
top_k: Maximum number of results to return (default 10).
quality: Search quality tier (default "auto"):
- "fast": FTS-only + rerank (no embedding needed, fastest)
- "balanced": FTS + binary coarse search + rerank
- "thorough": Full 2-stage vector + FTS + reranking (best quality)
- "auto": Uses "thorough" if vector index exists, else "fast"
Returns:
Search results as formatted text with file paths, line numbers, scores, and code snippets.
@@ -121,15 +128,75 @@ def search_code(project_path: str, query: str, top_k: int = 10) -> str:
if not (db_path / "metadata.db").exists():
return f"Error: no index found at {db_path}. Run index_project first."
valid_qualities = ("fast", "balanced", "thorough", "auto")
if quality not in valid_qualities:
return f"Error: invalid quality '{quality}'. Must be one of: {', '.join(valid_qualities)}"
_, search, _ = _get_pipelines(project_path)
results = search.search(query, top_k=top_k)
results = search.search(query, top_k=top_k, quality=quality)
if not results:
return "No results found."
lines = []
for i, r in enumerate(results, 1):
lines.append(f"## Result {i} {r.path} (L{r.line}-{r.end_line}, score: {r.score:.4f})")
lines.append(f"## Result {i} -- {r.path} (L{r.line}-{r.end_line}, score: {r.score:.4f})")
lines.append(f"```\n{r.content}\n```")
lines.append("")
return "\n".join(lines)
@mcp.tool()
def search_scope(
    project_path: str,
    query: str,
    scope_path: str,
    top_k: int = 10,
    quality: str = "auto",
) -> str:
    """Search within a specific directory scope of a project.

    Runs a normal search then filters results to only include files
    under the specified scope path.

    Args:
        project_path: Absolute path to the project root directory.
        query: Natural language or code search query.
        scope_path: Relative directory path to limit search scope
            (e.g. "src/auth"). An empty string or "." searches the
            whole project.
        top_k: Maximum number of scoped results to return (default 10).
        quality: Search quality tier ("fast", "balanced", "thorough", "auto").

    Returns:
        Search results filtered to the scope path.
    """
    root = Path(project_path).resolve()
    if not root.is_dir():
        return f"Error: project path not found: {root}"
    db_path = _db_path_for_project(project_path)
    if not (db_path / "metadata.db").exists():
        return f"Error: no index found at {db_path}. Run index_project first."
    # Validate quality here for a clear error, consistent with search_code.
    valid_qualities = ("fast", "balanced", "thorough", "auto")
    if quality not in valid_qualities:
        return f"Error: invalid quality '{quality}'. Must be one of: {', '.join(valid_qualities)}"
    # Normalize scope path for prefix matching. Without the special-casing
    # below, scope "" or "." would build the prefix "/" which matches no
    # relative path and silently returned "No results" for the whole project.
    scope = scope_path.replace("\\", "/").strip("/")
    if scope == ".":
        scope = ""
    # Build the prefix once instead of concatenating inside the loop.
    prefix = scope + "/" if scope else ""
    _, search, _ = _get_pipelines(project_path)
    # Fetch more results than top_k to account for filtering
    all_results = search.search(query, top_k=top_k * 5, quality=quality)
    # Filter by scope path prefix (empty prefix keeps everything).
    scoped = []
    for r in all_results:
        norm = r.path.replace("\\", "/")
        if not prefix or norm.startswith(prefix) or norm == scope:
            scoped.append(r)
    if not scoped:
        return f"No results found in scope '{scope_path}'."
    lines = []
    for i, r in enumerate(scoped[:top_k], 1):
        lines.append(f"## Result {i} -- {r.path} (L{r.line}-{r.end_line}, score: {r.score:.4f})")
        lines.append(f"```\n{r.content}\n```")
        lines.append("")
    return "\n".join(lines)
@@ -275,6 +342,59 @@ async def index_update(
)
@mcp.tool()
def index_scope(
    project_path: str,
    scope_path: str,
    glob_pattern: str = "**/*",
    tier: str = "full",
) -> str:
    """Index a specific directory scope within a project.

    Useful for quickly indexing a subdirectory (e.g. after editing files
    in a specific module) without re-indexing the entire project.

    Args:
        project_path: Absolute path to the project root directory.
        scope_path: Relative directory path to index (e.g. "src/auth").
            Must resolve to a directory inside the project root.
        glob_pattern: Glob pattern for files within scope (default "**/*").
        tier: Indexing tier - "full" (default) runs full pipeline with
            embedding, "fts_only" indexes text only (faster, no vectors).

    Returns:
        Indexing summary for the scoped directory.
    """
    root = Path(project_path).resolve()
    if not root.is_dir():
        return f"Error: project path not found: {root}"
    scope_dir = (root / scope_path).resolve()
    # Reject absolute or ".."-escaping scope paths that leave the project
    # root; otherwise p.relative_to(root) below raises an uncaught ValueError.
    try:
        scope_dir.relative_to(root)
    except ValueError:
        return f"Error: scope path escapes project root: {scope_path}"
    if not scope_dir.is_dir():
        return f"Error: scope directory not found: {scope_dir}"
    valid_tiers = ("full", "fts_only")
    if tier not in valid_tiers:
        return f"Error: invalid tier '{tier}'. Must be one of: {', '.join(valid_tiers)}"
    indexing, _, _ = _get_pipelines(project_path)
    file_paths = [
        p for p in scope_dir.glob(glob_pattern)
        if p.is_file() and not should_exclude(p.relative_to(root), DEFAULT_EXCLUDES)
    ]
    if not file_paths:
        return f"No files found in {scope_path} matching '{glob_pattern}'."
    # NOTE(review): sync() reconciles against a *current file list* and
    # removes indexed files missing from it -- confirm sync is scope-aware,
    # otherwise passing only this subdirectory's files may evict the rest
    # of the project's index.
    stats = indexing.sync(file_paths, root=root, tier=tier)
    tier_label = "FTS-only" if tier == "fts_only" else "full"
    return (
        f"Indexed {stats.files_processed} files ({tier_label}), "
        f"{stats.chunks_created} chunks in {stats.duration_seconds:.1f}s. "
        f"Scope: {scope_path}"
    )
# ---------------------------------------------------------------------------
# File discovery
# ---------------------------------------------------------------------------

View File

@@ -7,7 +7,7 @@ from dataclasses import dataclass
import numpy as np
from ..config import Config
from ..core import ANNIndex, BinaryStore
from ..core.base import BaseANNIndex, BaseBinaryIndex
from ..embed import BaseEmbedder
from ..indexing.metadata import MetadataStore
from ..rerank import BaseReranker
@@ -21,6 +21,8 @@ from .fusion import (
_log = logging.getLogger(__name__)
_VALID_QUALITIES = ("fast", "balanced", "thorough", "auto")
@dataclass
class SearchResult:
@@ -37,8 +39,8 @@ class SearchPipeline:
def __init__(
self,
embedder: BaseEmbedder,
binary_store: BinaryStore,
ann_index: ANNIndex,
binary_store: BaseBinaryIndex,
ann_index: BaseANNIndex,
reranker: BaseReranker,
fts: FTSEngine,
config: Config,
@@ -52,6 +54,15 @@ class SearchPipeline:
self._config = config
self._metadata_store = metadata_store
# -- Helper: check if vector index has data ----------------------------
def _has_vector_index(self) -> bool:
"""Check if the binary store has any indexed entries."""
try:
return len(self._binary_store) > 0
except Exception:
return False
# -- Helper: vector search (binary coarse + ANN fine) -----------------
def _vector_search(
@@ -84,6 +95,21 @@ class SearchPipeline:
]
return vector_results
# -- Helper: binary coarse search only --------------------------------
def _binary_coarse_search(
self, query_vec: np.ndarray
) -> list[tuple[int, float]]:
"""Run binary coarse search only (no ANN fine search)."""
cfg = self._config
candidate_ids, distances = self._binary_store.coarse_search(
query_vec, top_k=cfg.binary_top_k
)
return [
(int(doc_id), float(dist))
for doc_id, dist in zip(candidate_ids, distances)
]
# -- Helper: FTS search (exact + fuzzy) ------------------------------
def _fts_search(
@@ -95,55 +121,12 @@ class SearchPipeline:
fuzzy_results = self._fts.fuzzy_search(query, top_k=cfg.fts_top_k)
return exact_results, fuzzy_results
# -- Main search entry point -----------------------------------------
# -- Helper: filter deleted IDs ---------------------------------------
def search(self, query: str, top_k: int | None = None) -> list[SearchResult]:
cfg = self._config
final_top_k = top_k if top_k is not None else cfg.reranker_top_k
# 1. Detect intent -> adaptive weights
intent = detect_query_intent(query)
weights = get_adaptive_weights(intent, cfg.fusion_weights)
# 2. Embed query
query_vec = self._embedder.embed_single(query)
# 3. Parallel vector + FTS search
vector_results: list[tuple[int, float]] = []
exact_results: list[tuple[int, float]] = []
fuzzy_results: list[tuple[int, float]] = []
with ThreadPoolExecutor(max_workers=2) as pool:
vec_future = pool.submit(self._vector_search, query_vec)
fts_future = pool.submit(self._fts_search, query)
# Collect vector results
try:
vector_results = vec_future.result()
except Exception:
_log.warning("Vector search failed, using empty results", exc_info=True)
# Collect FTS results
try:
exact_results, fuzzy_results = fts_future.result()
except Exception:
_log.warning("FTS search failed, using empty results", exc_info=True)
# 4. RRF fusion
fusion_input: dict[str, list[tuple[int, float]]] = {}
if vector_results:
fusion_input["vector"] = vector_results
if exact_results:
fusion_input["exact"] = exact_results
if fuzzy_results:
fusion_input["fuzzy"] = fuzzy_results
if not fusion_input:
return []
fused = reciprocal_rank_fusion(fusion_input, weights=weights, k=cfg.fusion_k)
# 4b. Filter out deleted IDs (tombstone filtering)
def _filter_deleted(
self, fused: list[tuple[int, float]]
) -> list[tuple[int, float]]:
"""Remove tombstoned chunk IDs from results."""
if self._metadata_store is not None:
deleted_ids = self._metadata_store.get_deleted_ids()
if deleted_ids:
@@ -152,16 +135,30 @@ class SearchPipeline:
for doc_id, score in fused
if doc_id not in deleted_ids
]
return fused
# 5. Rerank top candidates
rerank_ids = [doc_id for doc_id, _ in fused[:50]]
contents = [self._fts.get_content(doc_id) for doc_id in rerank_ids]
rerank_scores = self._reranker.score_pairs(query, contents)
# -- Helper: rerank and build results ---------------------------------
# 6. Sort by rerank score, build SearchResult list
ranked = sorted(
zip(rerank_ids, rerank_scores), key=lambda x: x[1], reverse=True
)
def _rerank_and_build(
self,
query: str,
fused: list[tuple[int, float]],
final_top_k: int,
use_reranker: bool = True,
) -> list[SearchResult]:
"""Rerank candidates (optionally) and build SearchResult list."""
if not fused:
return []
if use_reranker:
rerank_ids = [doc_id for doc_id, _ in fused[:50]]
contents = [self._fts.get_content(doc_id) for doc_id in rerank_ids]
rerank_scores = self._reranker.score_pairs(query, contents)
ranked = sorted(
zip(rerank_ids, rerank_scores), key=lambda x: x[1], reverse=True
)
else:
ranked = fused
results: list[SearchResult] = []
for doc_id, score in ranked[:final_top_k]:
@@ -179,3 +176,178 @@ class SearchPipeline:
)
)
return results
# -- Helper: record access for tier tracking --------------------------
def _record_access(self, results: list[SearchResult]) -> None:
"""Record file access for data tier tracking."""
if results and self._metadata_store is not None:
unique_paths = list({r.path for r in results})
try:
self._metadata_store.record_access_batch(unique_paths)
except Exception:
_log.debug("Failed to record access for tier tracking", exc_info=True)
# -- Quality-routed search methods ------------------------------------
def _search_fast(
self, query: str, final_top_k: int
) -> list[SearchResult]:
"""FTS-only search with reranking. No embedding needed."""
exact_results, fuzzy_results = self._fts_search(query)
fusion_input: dict[str, list[tuple[int, float]]] = {}
if exact_results:
fusion_input["exact"] = exact_results
if fuzzy_results:
fusion_input["fuzzy"] = fuzzy_results
if not fusion_input:
return []
fused = reciprocal_rank_fusion(
fusion_input, weights={"exact": 0.7, "fuzzy": 0.3},
k=self._config.fusion_k,
)
fused = self._filter_deleted(fused)
return self._rerank_and_build(query, fused, final_top_k, use_reranker=True)
def _search_balanced(
self, query: str, final_top_k: int
) -> list[SearchResult]:
"""FTS + binary coarse search with RRF fusion and reranking.
Embeds the query for binary coarse search but skips ANN fine search.
"""
intent = detect_query_intent(query)
weights = get_adaptive_weights(intent, self._config.fusion_weights)
query_vec = self._embedder.embed_single(query)
# Parallel: binary coarse + FTS
coarse_results: list[tuple[int, float]] = []
exact_results: list[tuple[int, float]] = []
fuzzy_results: list[tuple[int, float]] = []
with ThreadPoolExecutor(max_workers=2) as pool:
coarse_future = pool.submit(self._binary_coarse_search, query_vec)
fts_future = pool.submit(self._fts_search, query)
try:
coarse_results = coarse_future.result()
except Exception:
_log.warning("Binary coarse search failed", exc_info=True)
try:
exact_results, fuzzy_results = fts_future.result()
except Exception:
_log.warning("FTS search failed", exc_info=True)
fusion_input: dict[str, list[tuple[int, float]]] = {}
if coarse_results:
fusion_input["vector"] = coarse_results
if exact_results:
fusion_input["exact"] = exact_results
if fuzzy_results:
fusion_input["fuzzy"] = fuzzy_results
if not fusion_input:
return []
fused = reciprocal_rank_fusion(fusion_input, weights=weights, k=self._config.fusion_k)
fused = self._filter_deleted(fused)
return self._rerank_and_build(query, fused, final_top_k, use_reranker=True)
def _search_thorough(
self, query: str, final_top_k: int
) -> list[SearchResult]:
"""Full 2-stage vector + FTS + reranking pipeline (original behavior)."""
cfg = self._config
intent = detect_query_intent(query)
weights = get_adaptive_weights(intent, cfg.fusion_weights)
query_vec = self._embedder.embed_single(query)
# Parallel vector + FTS search
vector_results: list[tuple[int, float]] = []
exact_results: list[tuple[int, float]] = []
fuzzy_results: list[tuple[int, float]] = []
with ThreadPoolExecutor(max_workers=2) as pool:
vec_future = pool.submit(self._vector_search, query_vec)
fts_future = pool.submit(self._fts_search, query)
try:
vector_results = vec_future.result()
except Exception:
_log.warning("Vector search failed, using empty results", exc_info=True)
try:
exact_results, fuzzy_results = fts_future.result()
except Exception:
_log.warning("FTS search failed, using empty results", exc_info=True)
fusion_input: dict[str, list[tuple[int, float]]] = {}
if vector_results:
fusion_input["vector"] = vector_results
if exact_results:
fusion_input["exact"] = exact_results
if fuzzy_results:
fusion_input["fuzzy"] = fuzzy_results
if not fusion_input:
return []
fused = reciprocal_rank_fusion(fusion_input, weights=weights, k=cfg.fusion_k)
fused = self._filter_deleted(fused)
return self._rerank_and_build(query, fused, final_top_k, use_reranker=True)
# -- Main search entry point -----------------------------------------
def search(
self,
query: str,
top_k: int | None = None,
quality: str | None = None,
) -> list[SearchResult]:
"""Search with quality-based routing.
Args:
query: Search query string.
top_k: Maximum results to return.
quality: Search quality tier:
- 'fast': FTS-only + rerank (no embedding, no vector search)
- 'balanced': FTS + binary coarse + rerank (no ANN fine search)
- 'thorough': Full 2-stage vector + FTS + reranking
- 'auto': Selects 'thorough' if vectors exist, else 'fast'
- None: Uses config.default_search_quality
Returns:
List of SearchResult ordered by relevance.
"""
cfg = self._config
final_top_k = top_k if top_k is not None else cfg.reranker_top_k
# Resolve quality tier
effective_quality = quality or cfg.default_search_quality
if effective_quality not in _VALID_QUALITIES:
_log.warning(
"Invalid search quality '%s', falling back to 'auto'",
effective_quality,
)
effective_quality = "auto"
# Auto-detect: use thorough if vector index has data, else fast
if effective_quality == "auto":
effective_quality = "thorough" if self._has_vector_index() else "fast"
if effective_quality == "fast":
results = self._search_fast(query, final_top_k)
elif effective_quality == "balanced":
results = self._search_balanced(query, final_top_k)
else:
results = self._search_thorough(query, final_top_k)
self._record_access(results)
return results

View File

@@ -20,6 +20,7 @@ from watchdog.events import FileSystemEventHandler
from watchdog.observers import Observer
from .events import ChangeType, FileEvent, WatcherConfig
from .incremental_indexer import IncrementalIndexer
logger = logging.getLogger(__name__)
@@ -261,3 +262,24 @@ class FileWatcher:
if output:
sys.stdout.write(output + "\n")
sys.stdout.flush()
@classmethod
def create_with_indexer(
cls,
root_path: Path,
config: WatcherConfig,
indexer: IncrementalIndexer,
) -> "FileWatcher":
"""Create a FileWatcher wired to an IncrementalIndexer's async path.
Uses ``indexer.process_events_async()`` as the callback so that
events are debounced and batched within the indexer before
processing, preventing redundant per-file pipeline startups.
Example::
indexer = IncrementalIndexer(pipeline, root=root)
watcher = FileWatcher.create_with_indexer(root, config, indexer)
watcher.start()
"""
return cls(root_path, config, indexer.process_events_async)

View File

@@ -4,10 +4,13 @@ Ported from codex-lens v1 with simplifications:
- Uses IndexingPipeline.index_file() / remove_file() directly
- No v1-specific Config, ParserFactory, DirIndexStore dependencies
- Per-file error isolation: one failure does not stop batch processing
- Debounce batching: process_events_async() buffers events and flushes
after a configurable window to prevent redundant per-file pipeline startups
"""
from __future__ import annotations
import logging
import threading
from dataclasses import dataclass, field
from pathlib import Path
from typing import List, Optional
@@ -60,6 +63,7 @@ class IncrementalIndexer:
pipeline: IndexingPipeline,
*,
root: Optional[Path] = None,
debounce_window_ms: int = 500,
) -> None:
"""Initialize the incremental indexer.
@@ -67,9 +71,15 @@ class IncrementalIndexer:
pipeline: The indexing pipeline with metadata store configured.
root: Optional project root for computing relative paths.
If None, absolute paths are used as identifiers.
debounce_window_ms: Milliseconds to buffer events before flushing
in process_events_async(). Default 500ms.
"""
self._pipeline = pipeline
self._root = root
self._debounce_window_ms = debounce_window_ms
self._event_buffer: List[FileEvent] = []
self._buffer_lock = threading.Lock()
self._flush_timer: Optional[threading.Timer] = None
def process_events(self, events: List[FileEvent]) -> BatchResult:
"""Process a batch of file events with per-file error isolation.
@@ -107,6 +117,52 @@ class IncrementalIndexer:
return result
def process_events_async(self, events: List[FileEvent]) -> None:
"""Buffer events and flush after the debounce window expires.
Non-blocking: events are accumulated in an internal buffer.
When no new events arrive within *debounce_window_ms*, the buffer
is flushed and all accumulated events are processed as a single
batch via process_events().
Args:
events: List of file events to buffer.
"""
with self._buffer_lock:
self._event_buffer.extend(events)
# Cancel previous timer and start a new one (true debounce)
if self._flush_timer is not None:
self._flush_timer.cancel()
self._flush_timer = threading.Timer(
self._debounce_window_ms / 1000.0,
self._flush_buffer,
)
self._flush_timer.daemon = True
self._flush_timer.start()
def _flush_buffer(self) -> None:
"""Flush the event buffer and process all accumulated events."""
with self._buffer_lock:
if not self._event_buffer:
return
events = list(self._event_buffer)
self._event_buffer.clear()
self._flush_timer = None
# Deduplicate: keep the last event per path
seen: dict[Path, FileEvent] = {}
for event in events:
seen[event.path] = event
deduped = list(seen.values())
logger.debug(
"Flushing debounce buffer: %d events (%d after dedup)",
len(events), len(deduped),
)
self.process_events(deduped)
def _handle_index(self, event: FileEvent, result: BatchResult) -> None:
"""Index a created or modified file."""
stats = self._pipeline.index_file(