mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-03-22 19:18:47 +08:00
feat: Enhance search functionality with quality tiers and scoped indexing
- Updated the `search_code` function to accept a `quality` parameter with search quality tiers: "fast", "balanced", "thorough", and "auto".
- Introduced a `search_scope` function to limit search results to a specific directory scope.
- Added an `index_scope` function for indexing a specific directory without re-indexing the entire project.
- Refactored `SearchPipeline` to support quality-based routing in its `search` method.
- Implemented `Shard` and `ShardManager` classes to manage multiple index shards with LRU eviction and efficient file routing.
- Added debounce functionality in `IncrementalIndexer` to batch file events and reduce redundant processing.
- Enhanced `FileWatcher` to integrate with `IncrementalIndexer` for improved event handling.
This commit is contained in:
@@ -124,6 +124,19 @@ def create_config_from_env(db_path: str | Path, **overrides: object) -> "Config"
|
||||
kwargs["hnsw_ef"] = int(os.environ["CODEXLENS_HNSW_EF"])
|
||||
if os.environ.get("CODEXLENS_HNSW_M"):
|
||||
kwargs["hnsw_M"] = int(os.environ["CODEXLENS_HNSW_M"])
|
||||
# Tier config from env
|
||||
if os.environ.get("CODEXLENS_TIER_HOT_HOURS"):
|
||||
kwargs["tier_hot_hours"] = int(os.environ["CODEXLENS_TIER_HOT_HOURS"])
|
||||
if os.environ.get("CODEXLENS_TIER_COLD_HOURS"):
|
||||
kwargs["tier_cold_hours"] = int(os.environ["CODEXLENS_TIER_COLD_HOURS"])
|
||||
# Search quality tier from env
|
||||
if os.environ.get("CODEXLENS_SEARCH_QUALITY"):
|
||||
kwargs["default_search_quality"] = os.environ["CODEXLENS_SEARCH_QUALITY"]
|
||||
# Shard config from env
|
||||
if os.environ.get("CODEXLENS_NUM_SHARDS"):
|
||||
kwargs["num_shards"] = int(os.environ["CODEXLENS_NUM_SHARDS"])
|
||||
if os.environ.get("CODEXLENS_MAX_LOADED_SHARDS"):
|
||||
kwargs["max_loaded_shards"] = int(os.environ["CODEXLENS_MAX_LOADED_SHARDS"])
|
||||
resolved = Path(db_path).resolve()
|
||||
kwargs["metadata_db_path"] = str(resolved / "metadata.db")
|
||||
return Config(**kwargs)
|
||||
@@ -143,28 +156,8 @@ def _create_config(args: argparse.Namespace) -> "Config":
|
||||
return create_config_from_env(args.db_path, **overrides)
|
||||
|
||||
|
||||
def create_pipeline(
|
||||
db_path: str | Path,
|
||||
config: "Config | None" = None,
|
||||
) -> tuple:
|
||||
"""Construct pipeline components from db_path and config.
|
||||
|
||||
Returns (indexing_pipeline, search_pipeline, config).
|
||||
Used by both CLI bridge and MCP server.
|
||||
"""
|
||||
from codexlens_search.config import Config
|
||||
from codexlens_search.core.factory import create_ann_index, create_binary_index
|
||||
from codexlens_search.indexing.metadata import MetadataStore
|
||||
from codexlens_search.indexing.pipeline import IndexingPipeline
|
||||
from codexlens_search.search.fts import FTSEngine
|
||||
from codexlens_search.search.pipeline import SearchPipeline
|
||||
|
||||
if config is None:
|
||||
config = create_config_from_env(db_path)
|
||||
resolved = Path(db_path).resolve()
|
||||
resolved.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Select embedder: API if configured, otherwise local fastembed
|
||||
def _create_embedder(config: "Config"):
|
||||
"""Create embedder based on config, auto-detecting embed_dim from API."""
|
||||
if config.embed_api_url:
|
||||
from codexlens_search.embed.api import APIEmbedder
|
||||
embedder = APIEmbedder(config)
|
||||
@@ -179,13 +172,11 @@ def create_pipeline(
|
||||
else:
|
||||
from codexlens_search.embed.local import FastEmbedEmbedder
|
||||
embedder = FastEmbedEmbedder(config)
|
||||
return embedder
|
||||
|
||||
binary_store = create_binary_index(resolved, config.embed_dim, config)
|
||||
ann_index = create_ann_index(resolved, config.embed_dim, config)
|
||||
fts = FTSEngine(resolved / "fts.db")
|
||||
metadata = MetadataStore(resolved / "metadata.db")
|
||||
|
||||
# Select reranker: API if configured, otherwise local fastembed
|
||||
def _create_reranker(config: "Config"):
|
||||
"""Create reranker based on config."""
|
||||
if config.reranker_api_url:
|
||||
from codexlens_search.rerank.api import APIReranker
|
||||
reranker = APIReranker(config)
|
||||
@@ -193,6 +184,60 @@ def create_pipeline(
|
||||
else:
|
||||
from codexlens_search.rerank.local import FastEmbedReranker
|
||||
reranker = FastEmbedReranker(config)
|
||||
return reranker
|
||||
|
||||
|
||||
def create_pipeline(
|
||||
db_path: str | Path,
|
||||
config: "Config | None" = None,
|
||||
) -> tuple:
|
||||
"""Construct pipeline components from db_path and config.
|
||||
|
||||
Returns (indexing_pipeline, search_pipeline, config).
|
||||
Used by both CLI bridge and MCP server.
|
||||
|
||||
When config.num_shards > 1, returns a ShardManager-backed pipeline
|
||||
where indexing and search are delegated to the ShardManager.
|
||||
The returned tuple is (shard_manager, shard_manager, config) so that
|
||||
callers can use shard_manager.sync() and shard_manager.search().
|
||||
"""
|
||||
from codexlens_search.config import Config
|
||||
|
||||
if config is None:
|
||||
config = create_config_from_env(db_path)
|
||||
resolved = Path(db_path).resolve()
|
||||
resolved.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
embedder = _create_embedder(config)
|
||||
reranker = _create_reranker(config)
|
||||
|
||||
# Sharded mode: delegate to ShardManager
|
||||
if config.num_shards > 1:
|
||||
from codexlens_search.core.shard_manager import ShardManager
|
||||
manager = ShardManager(
|
||||
num_shards=config.num_shards,
|
||||
db_path=resolved,
|
||||
config=config,
|
||||
embedder=embedder,
|
||||
reranker=reranker,
|
||||
)
|
||||
log.info(
|
||||
"Using ShardManager with %d shards (max_loaded=%d)",
|
||||
config.num_shards, config.max_loaded_shards,
|
||||
)
|
||||
return manager, manager, config
|
||||
|
||||
# Single-shard mode: original behavior, no ShardManager overhead
|
||||
from codexlens_search.core.factory import create_ann_index, create_binary_index
|
||||
from codexlens_search.indexing.metadata import MetadataStore
|
||||
from codexlens_search.indexing.pipeline import IndexingPipeline
|
||||
from codexlens_search.search.fts import FTSEngine
|
||||
from codexlens_search.search.pipeline import SearchPipeline
|
||||
|
||||
binary_store = create_binary_index(resolved, config.embed_dim, config)
|
||||
ann_index = create_ann_index(resolved, config.embed_dim, config)
|
||||
fts = FTSEngine(resolved / "fts.db")
|
||||
metadata = MetadataStore(resolved / "metadata.db")
|
||||
|
||||
indexing = IndexingPipeline(
|
||||
embedder=embedder,
|
||||
|
||||
@@ -47,7 +47,7 @@ class Config:
|
||||
|
||||
# Backend selection: 'auto', 'faiss', 'hnswlib'
|
||||
ann_backend: str = "auto"
|
||||
binary_backend: str = "auto"
|
||||
binary_backend: str = "faiss"
|
||||
|
||||
# Indexing pipeline
|
||||
index_workers: int = 2 # number of parallel indexing workers
|
||||
@@ -77,6 +77,17 @@ class Config:
|
||||
# Metadata store
|
||||
metadata_db_path: str = "" # empty = no metadata tracking
|
||||
|
||||
# Data tiering (hot/warm/cold)
|
||||
tier_hot_hours: int = 24 # files accessed within this window are 'hot'
|
||||
tier_cold_hours: int = 168 # files not accessed for this long are 'cold'
|
||||
|
||||
# Search quality tier: 'fast', 'balanced', 'thorough', 'auto'
|
||||
default_search_quality: str = "auto"
|
||||
|
||||
# Shard partitioning
|
||||
num_shards: int = 1 # 1 = single partition (no sharding), >1 = hash-based sharding
|
||||
max_loaded_shards: int = 4 # LRU limit for loaded shards in ShardManager
|
||||
|
||||
# FTS
|
||||
fts_top_k: int = 50
|
||||
|
||||
|
||||
@@ -15,6 +15,13 @@ logger = logging.getLogger(__name__)
|
||||
class BinaryStore(BaseBinaryIndex):
|
||||
"""Persistent binary vector store using numpy memmap.
|
||||
|
||||
.. deprecated::
|
||||
Prefer ``FAISSBinaryIndex`` for binary coarse search. This class is
|
||||
retained as a numpy-only fallback for environments where FAISS is not
|
||||
available. New code should use ``create_binary_index()`` from
|
||||
``codexlens_search.core.factory`` which selects the best backend
|
||||
automatically.
|
||||
|
||||
Stores binary-quantized float32 vectors as packed uint8 arrays on disk.
|
||||
Supports fast coarse search via XOR + popcount Hamming distance.
|
||||
"""
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import warnings
|
||||
from pathlib import Path
|
||||
|
||||
from codexlens_search.config import Config
|
||||
@@ -97,14 +98,29 @@ def create_binary_index(
|
||||
backend = config.binary_backend
|
||||
|
||||
if backend == "faiss":
|
||||
from codexlens_search.core.faiss_index import FAISSBinaryIndex
|
||||
return FAISSBinaryIndex(path, dim, config)
|
||||
if _FAISS_AVAILABLE:
|
||||
from codexlens_search.core.faiss_index import FAISSBinaryIndex
|
||||
return FAISSBinaryIndex(path, dim, config)
|
||||
# FAISS explicitly requested but not installed: fall back with warning
|
||||
from codexlens_search.core.binary import BinaryStore
|
||||
warnings.warn(
|
||||
"binary_backend='faiss' but FAISS is not installed. "
|
||||
"Falling back to deprecated numpy BinaryStore. "
|
||||
"Install faiss-cpu or faiss-gpu for the recommended binary backend.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
logger.warning(
|
||||
"binary_backend='faiss' but FAISS not available, "
|
||||
"falling back to deprecated numpy BinaryStore."
|
||||
)
|
||||
return BinaryStore(path, dim, config)
|
||||
|
||||
if backend == "hnswlib":
|
||||
from codexlens_search.core.binary import BinaryStore
|
||||
return BinaryStore(path, dim, config)
|
||||
|
||||
# auto: try faiss first, then numpy-based BinaryStore
|
||||
# auto: try faiss first, then numpy-based BinaryStore (deprecated fallback)
|
||||
if _FAISS_AVAILABLE:
|
||||
from codexlens_search.core.faiss_index import FAISSBinaryIndex
|
||||
logger.info("Auto-selected FAISS binary backend")
|
||||
@@ -112,5 +128,14 @@ def create_binary_index(
|
||||
|
||||
# numpy BinaryStore is always available (no extra deps)
|
||||
from codexlens_search.core.binary import BinaryStore
|
||||
logger.info("Auto-selected numpy BinaryStore backend")
|
||||
warnings.warn(
|
||||
"Falling back to numpy BinaryStore because FAISS is not installed. "
|
||||
"BinaryStore is deprecated; install faiss-cpu or faiss-gpu for better performance.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
logger.warning(
|
||||
"FAISS not available, falling back to deprecated numpy BinaryStore. "
|
||||
"Install faiss-cpu or faiss-gpu for the recommended binary backend."
|
||||
)
|
||||
return BinaryStore(path, dim, config)
|
||||
|
||||
@@ -71,10 +71,23 @@ class FAISSANNIndex(BaseANNIndex):
|
||||
self.load()
|
||||
|
||||
def load(self) -> None:
|
||||
"""Load index from disk or initialize a fresh one."""
|
||||
"""Load index from disk or initialize a fresh one.
|
||||
|
||||
Uses IO_FLAG_MMAP for zero-copy memory-mapped loading when available,
|
||||
falling back to regular read_index() on older faiss versions.
|
||||
"""
|
||||
with self._lock:
|
||||
if self._index_path.exists():
|
||||
idx = faiss.read_index(str(self._index_path))
|
||||
try:
|
||||
idx = faiss.read_index(
|
||||
str(self._index_path), faiss.IO_FLAG_MMAP
|
||||
)
|
||||
except (AttributeError, RuntimeError, Exception) as exc:
|
||||
logger.debug(
|
||||
"MMAP load failed, falling back to regular read: %s",
|
||||
exc,
|
||||
)
|
||||
idx = faiss.read_index(str(self._index_path))
|
||||
logger.debug(
|
||||
"Loaded FAISS ANN index from %s (%d items)",
|
||||
self._index_path, idx.ntotal,
|
||||
@@ -201,10 +214,23 @@ class FAISSBinaryIndex(BaseBinaryIndex):
|
||||
return np.packbits(binary).reshape(1, -1)
|
||||
|
||||
def load(self) -> None:
|
||||
"""Load binary index from disk or initialize a fresh one."""
|
||||
"""Load binary index from disk or initialize a fresh one.
|
||||
|
||||
Uses IO_FLAG_MMAP for zero-copy memory-mapped loading when available,
|
||||
falling back to regular read_index_binary() on older faiss versions.
|
||||
"""
|
||||
with self._lock:
|
||||
if self._index_path.exists():
|
||||
idx = faiss.read_index_binary(str(self._index_path))
|
||||
try:
|
||||
idx = faiss.read_index_binary(
|
||||
str(self._index_path), faiss.IO_FLAG_MMAP
|
||||
)
|
||||
except (AttributeError, RuntimeError, Exception) as exc:
|
||||
logger.debug(
|
||||
"MMAP load failed, falling back to regular read: %s",
|
||||
exc,
|
||||
)
|
||||
idx = faiss.read_index_binary(str(self._index_path))
|
||||
logger.debug(
|
||||
"Loaded FAISS binary index from %s (%d items)",
|
||||
self._index_path, idx.ntotal,
|
||||
|
||||
178
codex-lens-v2/src/codexlens_search/core/shard.py
Normal file
178
codex-lens-v2/src/codexlens_search/core/shard.py
Normal file
@@ -0,0 +1,178 @@
|
||||
"""Single index partition (shard) that owns FTS, binary, ANN, and metadata stores."""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
from codexlens_search.config import Config
|
||||
from codexlens_search.core.base import BaseANNIndex, BaseBinaryIndex
|
||||
from codexlens_search.embed.base import BaseEmbedder
|
||||
from codexlens_search.indexing.metadata import MetadataStore
|
||||
from codexlens_search.indexing.pipeline import IndexingPipeline, IndexStats
|
||||
from codexlens_search.rerank import BaseReranker
|
||||
from codexlens_search.search.fts import FTSEngine
|
||||
from codexlens_search.search.pipeline import SearchPipeline, SearchResult
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Shard:
    """One self-contained index partition.

    Every shard keeps its own FTS, binary, ANN, and metadata stores under
    ``<db_path>/shard_<shard_id>``. All components are created lazily on
    first use and may be dropped again via :meth:`unload` to reclaim
    memory. The embedder and reranker are deliberately NOT owned here:
    they are expensive to construct, so the caller (ShardManager) shares
    single instances across every shard.
    """

    def __init__(
        self,
        shard_id: int,
        db_path: str | Path,
        config: Config,
    ) -> None:
        self._shard_id = shard_id
        self._config = config
        self._shard_dir = Path(db_path).resolve() / f"shard_{shard_id}"

        # All of these stay None until _ensure_loaded() runs.
        self._fts: FTSEngine | None = None
        self._binary_store: BaseBinaryIndex | None = None
        self._ann_index: BaseANNIndex | None = None
        self._metadata: MetadataStore | None = None
        self._indexing: IndexingPipeline | None = None
        self._search: SearchPipeline | None = None
        self._loaded = False

    @property
    def shard_id(self) -> int:
        """Numeric identifier of this partition."""
        return self._shard_id

    @property
    def is_loaded(self) -> bool:
        """Whether the per-shard components are currently in memory."""
        return self._loaded

    def _ensure_loaded(
        self,
        embedder: BaseEmbedder,
        reranker: BaseReranker,
    ) -> None:
        """Build every per-shard component unless they already exist."""
        if self._loaded:
            return

        from codexlens_search.core.factory import create_ann_index, create_binary_index

        self._shard_dir.mkdir(parents=True, exist_ok=True)

        cfg = self._config
        self._fts = FTSEngine(self._shard_dir / "fts.db")
        self._binary_store = create_binary_index(self._shard_dir, cfg.embed_dim, cfg)
        self._ann_index = create_ann_index(self._shard_dir, cfg.embed_dim, cfg)
        self._metadata = MetadataStore(self._shard_dir / "metadata.db")

        self._indexing = IndexingPipeline(
            embedder=embedder,
            binary_store=self._binary_store,
            ann_index=self._ann_index,
            fts=self._fts,
            config=cfg,
            metadata=self._metadata,
        )
        self._search = SearchPipeline(
            embedder=embedder,
            binary_store=self._binary_store,
            ann_index=self._ann_index,
            reranker=reranker,
            fts=self._fts,
            config=cfg,
            metadata_store=self._metadata,
        )

        self._loaded = True
        logger.debug("Shard %d loaded from %s", self._shard_id, self._shard_dir)

    def load(
        self,
        embedder: BaseEmbedder,
        reranker: BaseReranker,
    ) -> None:
        """Eagerly create this shard's components."""
        self._ensure_loaded(embedder, reranker)

    def unload(self) -> None:
        """Close connections and drop all component references to free memory."""
        if not self._loaded:
            return

        # Only the metadata store holds an open DB connection that must be
        # closed explicitly; everything else is released by dropping refs.
        if self._metadata is not None:
            self._metadata.close()

        self._fts = None
        self._binary_store = None
        self._ann_index = None
        self._metadata = None
        self._indexing = None
        self._search = None
        self._loaded = False
        logger.debug("Shard %d unloaded", self._shard_id)

    def save(self) -> None:
        """Flush the binary and ANN indexes to disk (no-op when unloaded)."""
        if not self._loaded:
            return
        for store in (self._binary_store, self._ann_index):
            if store is not None:
                store.save()

    def search(
        self,
        query: str,
        embedder: BaseEmbedder,
        reranker: BaseReranker,
        quality: str | None = None,
        top_k: int | None = None,
    ) -> list[SearchResult]:
        """Run a search against this shard only.

        Args:
            query: Search query string.
            embedder: Shared embedder instance.
            reranker: Shared reranker instance.
            quality: Search quality tier.
            top_k: Maximum results to return.

        Returns:
            List of SearchResult ranked by this shard's pipeline.
        """
        self._ensure_loaded(embedder, reranker)
        assert self._search is not None
        return self._search.search(query, top_k=top_k, quality=quality)

    def sync(
        self,
        files: list[Path],
        root: Path | None,
        embedder: BaseEmbedder,
        reranker: BaseReranker,
        **kwargs: object,
    ) -> IndexStats:
        """Bring this shard's index up to date with the given files.

        Args:
            files: Files that route to this shard.
            root: Root directory for relative paths.
            embedder: Shared embedder instance.
            reranker: Shared reranker instance.
            **kwargs: Forwarded to IndexingPipeline.sync().

        Returns:
            IndexStats for this shard's sync operation.
        """
        self._ensure_loaded(embedder, reranker)
        assert self._indexing is not None
        return self._indexing.sync(files, root=root, **kwargs)
|
||||
250
codex-lens-v2/src/codexlens_search/core/shard_manager.py
Normal file
250
codex-lens-v2/src/codexlens_search/core/shard_manager.py
Normal file
@@ -0,0 +1,250 @@
|
||||
"""ShardManager: manages multiple Shard instances with LRU eviction."""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import threading
|
||||
from collections import OrderedDict
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from pathlib import Path
|
||||
|
||||
from codexlens_search.config import Config
|
||||
from codexlens_search.core.shard import Shard
|
||||
from codexlens_search.embed.base import BaseEmbedder
|
||||
from codexlens_search.indexing.pipeline import IndexStats
|
||||
from codexlens_search.rerank import BaseReranker
|
||||
from codexlens_search.search.fusion import reciprocal_rank_fusion
|
||||
from codexlens_search.search.pipeline import SearchResult
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ShardManager:
    """Coordinates multiple Shard instances with stable routing and LRU eviction.

    Files are deterministically routed to shards via a *stable* hash of
    their path (CRC-32 modulo ``num_shards``), so the same file always
    lands in the same shard across processes and restarts. Search queries
    fan out to all shards in parallel and results are merged with
    reciprocal-rank fusion. At most ``max_loaded_shards`` shards are kept
    in memory; least-recently-used shards beyond that limit are unloaded.
    """

    def __init__(
        self,
        num_shards: int,
        db_path: str | Path,
        config: Config,
        embedder: BaseEmbedder,
        reranker: BaseReranker,
    ) -> None:
        if num_shards < 1:
            raise ValueError("num_shards must be >= 1")

        self._num_shards = num_shards
        self._db_path = Path(db_path).resolve()
        self._config = config
        self._embedder = embedder
        self._reranker = reranker
        self._max_loaded = config.max_loaded_shards

        # Shard objects are cheap to create: they defer all I/O to first use.
        self._shards: dict[int, Shard] = {
            i: Shard(i, self._db_path, config) for i in range(num_shards)
        }

        # LRU bookkeeping: shard ids, oldest use first, newest last.
        self._loaded_order: OrderedDict[int, None] = OrderedDict()
        self._lru_lock = threading.Lock()

    @property
    def num_shards(self) -> int:
        """Total number of shard partitions."""
        return self._num_shards

    def route_file(self, path: str) -> int:
        """Deterministically map a file path to a shard ID.

        Uses CRC-32 of the UTF-8 encoded path instead of the builtin
        ``hash()``: CPython salts string hashing per process
        (PYTHONHASHSEED), so ``hash(path) % num_shards`` would route the
        same file to *different* shards across restarts, silently
        fragmenting the index. CRC-32 is stable across processes,
        platforms, and Python versions.
        """
        import zlib  # local import: stable, stdlib-only checksum

        return zlib.crc32(path.encode("utf-8")) % self._num_shards

    def get_shard(self, shard_id: int) -> Shard:
        """Return the Shard instance for a given shard_id.

        Raises:
            ValueError: If shard_id is outside the valid range.
        """
        try:
            return self._shards[shard_id]
        except KeyError:
            raise ValueError(
                f"Invalid shard_id {shard_id}, valid range: 0-{self._num_shards - 1}"
            ) from None

    def _ensure_loaded(self, shard_id: int) -> Shard:
        """Load a shard on demand, applying the LRU eviction policy.

        Thread-safe: all LRU bookkeeping and load/unload operations happen
        under ``self._lru_lock``. Returns the loaded Shard.
        """
        shard = self._shards[shard_id]

        with self._lru_lock:
            # Mark as most-recently-used.
            if shard_id in self._loaded_order:
                self._loaded_order.move_to_end(shard_id)
            else:
                self._loaded_order[shard_id] = None

            if not shard.is_loaded:
                shard.load(self._embedder, self._reranker)

            # Evict the oldest entries beyond the memory budget.
            while len(self._loaded_order) > self._max_loaded:
                evict_id, _ = self._loaded_order.popitem(last=False)
                victim = self._shards[evict_id]
                if victim.is_loaded:
                    logger.info("LRU evicting shard %d", evict_id)
                    victim.unload()

        return shard

    def sync(
        self,
        files: list[Path],
        root: Path | None = None,
        **kwargs: object,
    ) -> IndexStats:
        """Sync the index, routing every file to its owning shard.

        Groups files by shard via route_file(), then syncs each shard
        with its subset of files.

        Args:
            files: Current list of files to index.
            root: Root directory for relative paths.
            **kwargs: Forwarded to Shard.sync().

        Returns:
            Aggregated IndexStats across all shards.
        """
        # Bucket files per shard. Routing uses the root-relative path so
        # the assignment does not depend on where the project is checked out.
        buckets: dict[int, list[Path]] = {i: [] for i in range(self._num_shards)}
        for fpath in files:
            key = str(fpath.relative_to(root)) if root else str(fpath)
            buckets[self.route_file(key)].append(fpath)

        total_files = 0
        total_chunks = 0
        total_duration = 0.0

        for shard_id, bucket in buckets.items():
            if not bucket:
                continue
            shard = self._ensure_loaded(shard_id)
            stats = shard.sync(
                bucket,
                root=root,
                embedder=self._embedder,
                reranker=self._reranker,
                **kwargs,
            )
            total_files += stats.files_processed
            total_chunks += stats.chunks_created
            total_duration += stats.duration_seconds

        return IndexStats(
            files_processed=total_files,
            chunks_created=total_chunks,
            duration_seconds=round(total_duration, 2),
        )

    def search(
        self,
        query: str,
        quality: str | None = None,
        top_k: int | None = None,
    ) -> list[SearchResult]:
        """Search all shards in parallel and merge results via RRF fusion.

        Each shard returns its own ranked results; cross-shard merging uses
        reciprocal_rank_fusion with equal weight per shard. Per-shard top_k
        is doubled to compensate for cross-shard dilution.

        Args:
            query: Search query string.
            quality: Search quality tier.
            top_k: Maximum final results to return.

        Returns:
            Merged list of SearchResult ordered by relevance.
        """
        cfg = self._config
        final_top_k = top_k if top_k is not None else cfg.reranker_top_k
        # Over-fetch per shard so the cross-shard merge has enough candidates.
        per_shard_top_k = final_top_k * 2

        # Load all shards before fanning out the query.
        for shard_id in range(self._num_shards):
            self._ensure_loaded(shard_id)

        def _search_one(sid: int) -> tuple[int, list[SearchResult]]:
            results = self._shards[sid].search(
                query,
                embedder=self._embedder,
                reranker=self._reranker,
                quality=quality,
                top_k=per_shard_top_k,
            )
            return sid, results

        shard_results: dict[int, list[SearchResult]] = {}
        with ThreadPoolExecutor(max_workers=min(self._num_shards, 4)) as pool:
            futures = [pool.submit(_search_one, sid) for sid in range(self._num_shards)]
            for future in futures:
                try:
                    sid, results = future.result()
                    shard_results[sid] = results
                except Exception:
                    # A failing shard degrades results but must not kill the query.
                    logger.warning("Shard search failed", exc_info=True)

        non_empty = {sid: res for sid, res in shard_results.items() if res}
        if not non_empty:
            return []
        if len(non_empty) == 1:
            # Single contributing shard: its ranking is already final.
            only = next(iter(non_empty.values()))
            return only[:final_top_k]

        # Cross-shard RRF merge. Doc ids are shard-local, so assign each
        # result a globally unique synthetic id before fusing.
        rrf_input: dict[str, list[tuple[int, float]]] = {}
        global_results: dict[int, SearchResult] = {}
        next_gid = 0
        for sid, results in non_empty.items():
            ranked: list[tuple[int, float]] = []
            for res in results:
                global_results[next_gid] = res
                ranked.append((next_gid, res.score))
                next_gid += 1
            rrf_input[f"shard_{sid}"] = ranked

        fused = reciprocal_rank_fusion(rrf_input, k=cfg.fusion_k)

        merged: list[SearchResult] = []
        for gid, fused_score in fused[:final_top_k]:
            base = global_results[gid]
            merged.append(
                SearchResult(
                    id=base.id,
                    path=base.path,
                    score=fused_score,
                    snippet=base.snippet,
                    line=base.line,
                    end_line=base.end_line,
                    content=base.content,
                )
            )
        return merged
|
||||
@@ -2,6 +2,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import sqlite3
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
@@ -9,7 +10,8 @@ class MetadataStore:
|
||||
"""Tracks file-to-chunk mappings and deleted chunk IDs (tombstones).
|
||||
|
||||
Tables:
|
||||
files - file_path (PK), content_hash, last_modified
|
||||
files - file_path (PK), content_hash, last_modified, file_size,
|
||||
tier ('hot'/'warm'/'cold'), last_accessed (epoch float)
|
||||
chunks - chunk_id (PK), file_path (FK CASCADE), chunk_hash
|
||||
deleted_chunks - chunk_id (PK) for tombstone tracking
|
||||
"""
|
||||
@@ -19,13 +21,18 @@ class MetadataStore:
|
||||
self._conn.execute("PRAGMA foreign_keys = ON")
|
||||
self._conn.execute("PRAGMA journal_mode = WAL")
|
||||
self._create_tables()
|
||||
self._migrate_size_column()
|
||||
self._migrate_tier_columns()
|
||||
|
||||
def _create_tables(self) -> None:
|
||||
self._conn.executescript("""
|
||||
CREATE TABLE IF NOT EXISTS files (
|
||||
file_path TEXT PRIMARY KEY,
|
||||
content_hash TEXT NOT NULL,
|
||||
last_modified REAL NOT NULL
|
||||
last_modified REAL NOT NULL,
|
||||
file_size INTEGER NOT NULL DEFAULT 0,
|
||||
tier TEXT NOT NULL DEFAULT 'warm',
|
||||
last_accessed REAL
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS chunks (
|
||||
@@ -41,14 +48,48 @@ class MetadataStore:
|
||||
""")
|
||||
self._conn.commit()
|
||||
|
||||
def _migrate_size_column(self) -> None:
|
||||
"""Add file_size column if missing (for pre-existing DBs)."""
|
||||
cols = {
|
||||
row[1]
|
||||
for row in self._conn.execute("PRAGMA table_info(files)").fetchall()
|
||||
}
|
||||
if "file_size" not in cols:
|
||||
self._conn.execute(
|
||||
"ALTER TABLE files ADD COLUMN file_size INTEGER NOT NULL DEFAULT 0"
|
||||
)
|
||||
self._conn.commit()
|
||||
|
||||
def _migrate_tier_columns(self) -> None:
|
||||
"""Add tier and last_accessed columns if missing (for pre-existing DBs)."""
|
||||
cols = {
|
||||
row[1]
|
||||
for row in self._conn.execute("PRAGMA table_info(files)").fetchall()
|
||||
}
|
||||
if "tier" not in cols:
|
||||
self._conn.execute(
|
||||
"ALTER TABLE files ADD COLUMN tier TEXT NOT NULL DEFAULT 'warm'"
|
||||
)
|
||||
if "last_accessed" not in cols:
|
||||
self._conn.execute(
|
||||
"ALTER TABLE files ADD COLUMN last_accessed REAL"
|
||||
)
|
||||
if "tier" not in cols or "last_accessed" not in cols:
|
||||
self._conn.commit()
|
||||
|
||||
def register_file(
|
||||
self, file_path: str, content_hash: str, mtime: float
|
||||
self,
|
||||
file_path: str,
|
||||
content_hash: str,
|
||||
mtime: float,
|
||||
file_size: int = 0,
|
||||
) -> None:
|
||||
"""Insert or update a file record."""
|
||||
self._conn.execute(
|
||||
"INSERT OR REPLACE INTO files (file_path, content_hash, last_modified) "
|
||||
"VALUES (?, ?, ?)",
|
||||
(file_path, content_hash, mtime),
|
||||
"INSERT OR REPLACE INTO files "
|
||||
"(file_path, content_hash, last_modified, file_size) "
|
||||
"VALUES (?, ?, ?, ?)",
|
||||
(file_path, content_hash, mtime, file_size),
|
||||
)
|
||||
self._conn.commit()
|
||||
|
||||
@@ -121,6 +162,24 @@ class MetadataStore:
|
||||
return True # New file
|
||||
return stored != content_hash
|
||||
|
||||
def file_needs_update_fast(
    self, file_path: str, mtime: float, size: int
) -> bool:
    """Cheap staleness pre-check using mtime and size, without reading content.

    Returns True when the file is untracked or its stored
    (last_modified, file_size) pair differs from the given values.
    When both match, the file is assumed unchanged — roughly 1000x
    faster than a content-hash comparison.
    """
    query = "SELECT last_modified, file_size FROM files WHERE file_path = ?"
    record = self._conn.execute(query, (file_path,)).fetchone()
    if record is None:
        # Not tracked yet -> must be indexed.
        return True
    return (record[0], record[1]) != (mtime, size)
|
||||
|
||||
def compact_deleted(self) -> set[int]:
|
||||
"""Return deleted IDs and clear the deleted_chunks table.
|
||||
|
||||
@@ -161,5 +220,81 @@ class MetadataStore:
|
||||
).fetchone()
|
||||
return row[0] if row[0] is not None else -1
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Tier management
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def record_access(self, file_path: str) -> None:
    """Stamp *file_path* as accessed now (feeds hot/warm/cold tiering)."""
    timestamp = time.time()
    self._conn.execute(
        "UPDATE files SET last_accessed = ? WHERE file_path = ?",
        (timestamp, file_path),
    )
    self._conn.commit()
||||
|
||||
def record_access_batch(self, file_paths: list[str]) -> None:
    """Stamp several files as accessed with one shared timestamp."""
    if not file_paths:
        # Nothing to do; avoid an empty executemany + commit round-trip.
        return
    stamp = time.time()
    rows = [(stamp, path) for path in file_paths]
    self._conn.executemany(
        "UPDATE files SET last_accessed = ? WHERE file_path = ?",
        rows,
    )
    self._conn.commit()
|
||||
|
||||
def classify_tiers(
    self, hot_threshold_hours: int = 24, cold_threshold_hours: int = 168
) -> None:
    """Reclassify all files into hot/warm/cold tiers based on last_accessed.

    - hot: last_accessed within hot_threshold_hours
    - cold: last_accessed older than cold_threshold_hours (or never accessed)
    - warm: everything in between

    Args:
        hot_threshold_hours: Access within this window marks a file 'hot'.
        cold_threshold_hours: No access for this long marks a file 'cold'.
    """
    now = time.time()
    hot_cutoff = now - hot_threshold_hours * 3600
    cold_cutoff = now - cold_threshold_hours * 3600

    # Single-pass CASE update instead of three separate UPDATE scans.
    # The original cold clause "last_accessed < cold_cutoff AND
    # last_accessed < hot_cutoff" was redundant: with hot_cutoff >
    # cold_cutoff the second comparison is implied, and the hot branch
    # (checked first here, as before) already claims anything >= hot_cutoff.
    self._conn.execute(
        "UPDATE files SET tier = CASE "
        "WHEN last_accessed IS NOT NULL AND last_accessed >= ? THEN 'hot' "
        "WHEN last_accessed IS NULL OR last_accessed < ? THEN 'cold' "
        "ELSE 'warm' END",
        (hot_cutoff, cold_cutoff),
    )
    self._conn.commit()
|
||||
|
||||
def get_files_by_tier(self, tier: str) -> list[str]:
    """Return file paths in the specified tier ('hot', 'warm', or 'cold')."""
    cursor = self._conn.execute(
        "SELECT file_path FROM files WHERE tier = ?", (tier,)
    )
    return [path for (path,) in cursor.fetchall()]
|
||||
|
||||
def get_cold_files(self) -> list[str]:
    """Convenience wrapper: file paths currently classified as 'cold'."""
    return self.get_files_by_tier("cold")
|
||||
|
||||
def get_file_tier(self, file_path: str) -> str | None:
|
||||
"""Return the tier for a specific file, or None if not tracked."""
|
||||
row = self._conn.execute(
|
||||
"SELECT tier FROM files WHERE file_path = ?", (file_path,)
|
||||
).fetchone()
|
||||
return row[0] if row else None
|
||||
|
||||
def close(self) -> None:
    """Close the underlying SQLite connection."""
    self._conn.close()
|
||||
|
||||
@@ -17,8 +17,7 @@ from pathlib import Path
|
||||
import numpy as np
|
||||
|
||||
from codexlens_search.config import Config
|
||||
from codexlens_search.core.binary import BinaryStore
|
||||
from codexlens_search.core.index import ANNIndex
|
||||
from codexlens_search.core.base import BaseANNIndex, BaseBinaryIndex
|
||||
from codexlens_search.embed.base import BaseEmbedder
|
||||
from codexlens_search.indexing.metadata import MetadataStore
|
||||
from codexlens_search.search.fts import FTSEngine
|
||||
@@ -100,8 +99,8 @@ class IndexingPipeline:
|
||||
def __init__(
|
||||
self,
|
||||
embedder: BaseEmbedder,
|
||||
binary_store: BinaryStore,
|
||||
ann_index: ANNIndex,
|
||||
binary_store: BaseBinaryIndex,
|
||||
ann_index: BaseANNIndex,
|
||||
fts: FTSEngine,
|
||||
config: Config,
|
||||
metadata: MetadataStore | None = None,
|
||||
@@ -463,6 +462,94 @@ class IndexingPipeline:
|
||||
meta = self._require_metadata()
|
||||
return meta.max_chunk_id() + 1
|
||||
|
||||
def index_files_fts_only(
    self,
    files: list[Path],
    *,
    root: Path | None = None,
    max_chunk_chars: int = _DEFAULT_MAX_CHUNK_CHARS,
    chunk_overlap: int = _DEFAULT_CHUNK_OVERLAP,
) -> IndexStats:
    """Index files into FTS5 only, without embedding or vector indexing.

    Chunks files using the same logic as the full pipeline, then inserts
    directly into FTS. No embedding computation, no binary/ANN store writes.

    Args:
        files: List of file paths to index.
        root: Optional root for computing relative paths.
        max_chunk_chars: Maximum characters per chunk.
        chunk_overlap: Character overlap between consecutive chunks.

    Returns:
        IndexStats with counts and timing.
    """
    if not files:
        # Nothing to do: return zeroed stats without touching metadata.
        return IndexStats()

    meta = self._require_metadata()
    t0 = time.monotonic()
    # Chunk IDs continue from the current maximum so FTS doc IDs stay
    # unique across incremental runs.
    chunk_id = self._next_chunk_id()
    files_processed = 0
    chunks_created = 0

    for fpath in files:
        # Honor the configured exclusion rules (same as the full pipeline).
        exclude_reason = is_file_excluded(fpath, self._config)
        if exclude_reason:
            logger.debug("Skipping %s: %s", fpath, exclude_reason)
            continue
        try:
            # errors="replace" keeps binary-ish files from aborting the run.
            text = fpath.read_text(encoding="utf-8", errors="replace")
        except Exception as exc:
            logger.debug("Skipping %s: %s", fpath, exc)
            continue

        rel_path = str(fpath.relative_to(root)) if root else str(fpath)
        content_hash = self._content_hash(text)

        # Skip unchanged files (content hash matches stored hash).
        if not meta.file_needs_update(rel_path, content_hash):
            continue

        # Remove old FTS data if file was previously indexed, so the
        # re-insert below does not duplicate chunks.
        if meta.get_file_hash(rel_path) is not None:
            meta.mark_file_deleted(rel_path)
            self._fts.delete_by_path(rel_path)

        file_chunks = self._smart_chunk(text, rel_path, max_chunk_chars, chunk_overlap)
        if not file_chunks:
            # Empty chunk list (e.g. empty file): still record the file so
            # the next sync sees it as up to date.
            st = fpath.stat()
            meta.register_file(rel_path, content_hash, st.st_mtime, st.st_size)
            continue

        files_processed += 1
        fts_docs = []
        chunk_id_hashes = []
        for chunk_text, path, sl, el in file_chunks:
            fts_docs.append((chunk_id, path, chunk_text, sl, el))
            chunk_id_hashes.append((chunk_id, self._content_hash(chunk_text)))
            chunk_id += 1

        self._fts.add_documents(fts_docs)
        chunks_created += len(fts_docs)

        # Register metadata: file record plus per-chunk hashes for
        # future incremental comparisons.
        st = fpath.stat()
        meta.register_file(rel_path, content_hash, st.st_mtime, st.st_size)
        meta.register_chunks(rel_path, chunk_id_hashes)

    duration = time.monotonic() - t0
    stats = IndexStats(
        files_processed=files_processed,
        chunks_created=chunks_created,
        duration_seconds=round(duration, 2),
    )
    logger.info(
        "FTS-only indexing complete: %d files, %d chunks in %.1fs",
        stats.files_processed, stats.chunks_created, stats.duration_seconds,
    )
    return stats
|
||||
|
||||
def index_file(
|
||||
self,
|
||||
file_path: Path,
|
||||
@@ -522,7 +609,8 @@ class IndexingPipeline:
|
||||
file_chunks = self._smart_chunk(text, rel_path, max_chunk_chars, chunk_overlap)
|
||||
if not file_chunks:
|
||||
# Register file with no chunks
|
||||
meta.register_file(rel_path, content_hash, file_path.stat().st_mtime)
|
||||
st = file_path.stat()
|
||||
meta.register_file(rel_path, content_hash, st.st_mtime, st.st_size)
|
||||
return IndexStats(
|
||||
files_processed=1,
|
||||
duration_seconds=round(time.monotonic() - t0, 2),
|
||||
@@ -556,7 +644,8 @@ class IndexingPipeline:
|
||||
self._fts.add_documents(fts_docs)
|
||||
|
||||
# Register in metadata
|
||||
meta.register_file(rel_path, content_hash, file_path.stat().st_mtime)
|
||||
st = file_path.stat()
|
||||
meta.register_file(rel_path, content_hash, st.st_mtime, st.st_size)
|
||||
chunk_id_hashes = [
|
||||
(batch_ids[i], self._content_hash(batch_texts[i]))
|
||||
for i in range(len(batch_ids))
|
||||
@@ -605,6 +694,7 @@ class IndexingPipeline:
|
||||
chunk_overlap: int = _DEFAULT_CHUNK_OVERLAP,
|
||||
max_file_size: int = 50_000,
|
||||
progress_callback: callable | None = None,
|
||||
tier: str = "full",
|
||||
) -> IndexStats:
|
||||
"""Reconcile index state against a current file list.
|
||||
|
||||
@@ -617,6 +707,9 @@ class IndexingPipeline:
|
||||
max_chunk_chars: Maximum characters per chunk.
|
||||
chunk_overlap: Character overlap between consecutive chunks.
|
||||
max_file_size: Skip files larger than this (bytes).
|
||||
tier: Indexing tier - 'full' (default) runs the full pipeline
|
||||
with embedding, 'fts_only' runs FTS-only indexing without
|
||||
embedding or vector stores.
|
||||
|
||||
Returns:
|
||||
Aggregated IndexStats for all operations.
|
||||
@@ -638,33 +731,72 @@ class IndexingPipeline:
|
||||
for rel in removed:
|
||||
self.remove_file(rel)
|
||||
|
||||
# Collect files needing update
|
||||
# Collect files needing update using 4-level detection:
|
||||
# Level 1: set diff (removed files) - handled above
|
||||
# Level 2: mtime + size fast pre-check via stat()
|
||||
# Level 3: content hash only when mtime/size mismatch
|
||||
files_to_index: list[Path] = []
|
||||
for rel, fpath in current_rel_paths.items():
|
||||
# Level 2: stat-based fast check
|
||||
try:
|
||||
st = fpath.stat()
|
||||
except OSError:
|
||||
continue
|
||||
if not meta.file_needs_update_fast(rel, st.st_mtime, st.st_size):
|
||||
# mtime + size match stored values -> skip (no read needed)
|
||||
continue
|
||||
|
||||
# Level 3: mtime/size changed -> verify with content hash
|
||||
try:
|
||||
text = fpath.read_text(encoding="utf-8", errors="replace")
|
||||
except Exception:
|
||||
continue
|
||||
content_hash = self._content_hash(text)
|
||||
if meta.file_needs_update(rel, content_hash):
|
||||
# Remove old data if previously indexed
|
||||
if meta.get_file_hash(rel) is not None:
|
||||
meta.mark_file_deleted(rel)
|
||||
self._fts.delete_by_path(rel)
|
||||
files_to_index.append(fpath)
|
||||
if not meta.file_needs_update(rel, content_hash):
|
||||
# Content unchanged despite mtime/size change -> update metadata only
|
||||
meta.register_file(rel, content_hash, st.st_mtime, st.st_size)
|
||||
continue
|
||||
|
||||
# Batch index via parallel pipeline
|
||||
# File genuinely changed -> remove old data and queue for re-index
|
||||
if meta.get_file_hash(rel) is not None:
|
||||
meta.mark_file_deleted(rel)
|
||||
self._fts.delete_by_path(rel)
|
||||
files_to_index.append(fpath)
|
||||
|
||||
# Sort files by data tier priority: hot first, then warm, then cold
|
||||
if files_to_index:
|
||||
# Set starting chunk ID from metadata
|
||||
start_id = self._next_chunk_id()
|
||||
batch_stats = self._index_files_with_metadata(
|
||||
files_to_index,
|
||||
root=root,
|
||||
max_chunk_chars=max_chunk_chars,
|
||||
chunk_overlap=chunk_overlap,
|
||||
start_chunk_id=start_id,
|
||||
progress_callback=progress_callback,
|
||||
)
|
||||
_tier_priority = {"hot": 0, "warm": 1, "cold": 2}
|
||||
def _tier_sort_key(fp: Path) -> int:
|
||||
rel = str(fp.relative_to(root)) if root else str(fp)
|
||||
t = meta.get_file_tier(rel)
|
||||
return _tier_priority.get(t or "warm", 1)
|
||||
files_to_index.sort(key=_tier_sort_key)
|
||||
|
||||
# Reclassify data tiers after sync detection
|
||||
meta.classify_tiers(
|
||||
self._config.tier_hot_hours, self._config.tier_cold_hours
|
||||
)
|
||||
|
||||
# Batch index via parallel pipeline or FTS-only
|
||||
if files_to_index:
|
||||
if tier == "fts_only":
|
||||
batch_stats = self.index_files_fts_only(
|
||||
files_to_index,
|
||||
root=root,
|
||||
max_chunk_chars=max_chunk_chars,
|
||||
chunk_overlap=chunk_overlap,
|
||||
)
|
||||
else:
|
||||
# Full pipeline with embedding
|
||||
start_id = self._next_chunk_id()
|
||||
batch_stats = self._index_files_with_metadata(
|
||||
files_to_index,
|
||||
root=root,
|
||||
max_chunk_chars=max_chunk_chars,
|
||||
chunk_overlap=chunk_overlap,
|
||||
start_chunk_id=start_id,
|
||||
progress_callback=progress_callback,
|
||||
)
|
||||
total_files = batch_stats.files_processed
|
||||
total_chunks = batch_stats.chunks_created
|
||||
else:
|
||||
@@ -781,7 +913,8 @@ class IndexingPipeline:
|
||||
file_chunks = self._smart_chunk(text, rel_path, max_chunk_chars, chunk_overlap)
|
||||
|
||||
if not file_chunks:
|
||||
meta.register_file(rel_path, content_hash, fpath.stat().st_mtime)
|
||||
st = fpath.stat()
|
||||
meta.register_file(rel_path, content_hash, st.st_mtime, st.st_size)
|
||||
continue
|
||||
|
||||
files_processed += 1
|
||||
@@ -806,7 +939,8 @@ class IndexingPipeline:
|
||||
chunks_created += len(file_chunk_ids)
|
||||
|
||||
# Register metadata per file
|
||||
meta.register_file(rel_path, content_hash, fpath.stat().st_mtime)
|
||||
st = fpath.stat()
|
||||
meta.register_file(rel_path, content_hash, st.st_mtime, st.st_size)
|
||||
chunk_id_hashes = [
|
||||
(cid, self._content_hash(ct)) for cid, ct in file_chunk_ids
|
||||
]
|
||||
|
||||
@@ -102,13 +102,20 @@ def _get_pipelines(project_path: str) -> tuple:
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@mcp.tool()
|
||||
def search_code(project_path: str, query: str, top_k: int = 10) -> str:
|
||||
def search_code(
|
||||
project_path: str, query: str, top_k: int = 10, quality: str = "auto"
|
||||
) -> str:
|
||||
"""Semantic code search with hybrid fusion (vector + FTS + reranking).
|
||||
|
||||
Args:
|
||||
project_path: Absolute path to the project root directory.
|
||||
query: Natural language or code search query.
|
||||
top_k: Maximum number of results to return (default 10).
|
||||
quality: Search quality tier (default "auto"):
|
||||
- "fast": FTS-only + rerank (no embedding needed, fastest)
|
||||
- "balanced": FTS + binary coarse search + rerank
|
||||
- "thorough": Full 2-stage vector + FTS + reranking (best quality)
|
||||
- "auto": Uses "thorough" if vector index exists, else "fast"
|
||||
|
||||
Returns:
|
||||
Search results as formatted text with file paths, line numbers, scores, and code snippets.
|
||||
@@ -121,15 +128,75 @@ def search_code(project_path: str, query: str, top_k: int = 10) -> str:
|
||||
if not (db_path / "metadata.db").exists():
|
||||
return f"Error: no index found at {db_path}. Run index_project first."
|
||||
|
||||
valid_qualities = ("fast", "balanced", "thorough", "auto")
|
||||
if quality not in valid_qualities:
|
||||
return f"Error: invalid quality '{quality}'. Must be one of: {', '.join(valid_qualities)}"
|
||||
|
||||
_, search, _ = _get_pipelines(project_path)
|
||||
results = search.search(query, top_k=top_k)
|
||||
results = search.search(query, top_k=top_k, quality=quality)
|
||||
|
||||
if not results:
|
||||
return "No results found."
|
||||
|
||||
lines = []
|
||||
for i, r in enumerate(results, 1):
|
||||
lines.append(f"## Result {i} — {r.path} (L{r.line}-{r.end_line}, score: {r.score:.4f})")
|
||||
lines.append(f"## Result {i} -- {r.path} (L{r.line}-{r.end_line}, score: {r.score:.4f})")
|
||||
lines.append(f"```\n{r.content}\n```")
|
||||
lines.append("")
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
@mcp.tool()
def search_scope(
    project_path: str,
    query: str,
    scope_path: str,
    top_k: int = 10,
    quality: str = "auto",
) -> str:
    """Search within a specific directory scope of a project.

    Runs a normal search then filters results to only include files
    under the specified scope path.

    Args:
        project_path: Absolute path to the project root directory.
        query: Natural language or code search query.
        scope_path: Relative directory path to limit search scope
            (e.g. "src/auth"). An empty value or "/" means the whole
            project (no filtering).
        top_k: Maximum number of scoped results to return (default 10).
        quality: Search quality tier ("fast", "balanced", "thorough", "auto").

    Returns:
        Search results filtered to the scope path.
    """
    root = Path(project_path).resolve()
    if not root.is_dir():
        return f"Error: project path not found: {root}"

    db_path = _db_path_for_project(project_path)
    if not (db_path / "metadata.db").exists():
        return f"Error: no index found at {db_path}. Run index_project first."

    # Normalize scope path for prefix matching (forward slashes, no
    # surrounding separators).
    scope = scope_path.replace("\\", "/").strip("/")

    _, search, _ = _get_pipelines(project_path)
    # Fetch more results than top_k to account for filtering.
    all_results = search.search(query, top_k=top_k * 5, quality=quality)

    # Filter by scope path prefix. Fix: an empty scope previously matched
    # nothing (startswith("/") never hit); treat it as "entire project".
    scoped = []
    for r in all_results:
        norm = r.path.replace("\\", "/")
        if not scope or norm == scope or norm.startswith(scope + "/"):
            scoped.append(r)

    if not scoped:
        return f"No results found in scope '{scope_path}'."

    lines = []
    for i, r in enumerate(scoped[:top_k], 1):
        lines.append(f"## Result {i} -- {r.path} (L{r.line}-{r.end_line}, score: {r.score:.4f})")
        lines.append(f"```\n{r.content}\n```")
        lines.append("")
    return "\n".join(lines)
|
||||
@@ -275,6 +342,59 @@ async def index_update(
|
||||
)
|
||||
|
||||
|
||||
@mcp.tool()
def index_scope(
    project_path: str,
    scope_path: str,
    glob_pattern: str = "**/*",
    tier: str = "full",
) -> str:
    """Index a specific directory scope within a project.

    Useful for quickly indexing a subdirectory (e.g. after editing files
    in a specific module) without re-indexing the entire project.

    Args:
        project_path: Absolute path to the project root directory.
        scope_path: Relative directory path to index (e.g. "src/auth").
        glob_pattern: Glob pattern for files within scope (default "**/*").
        tier: Indexing tier - "full" (default) runs full pipeline with
            embedding, "fts_only" indexes text only (faster, no vectors).

    Returns:
        Indexing summary for the scoped directory.
    """
    root = Path(project_path).resolve()
    if not root.is_dir():
        return f"Error: project path not found: {root}"

    # Resolve and validate the scope. An absolute or ".."-bearing
    # scope_path could otherwise escape the project root, and
    # p.relative_to(root) below would then raise ValueError.
    scope_dir = (root / scope_path).resolve()
    if not scope_dir.is_dir():
        return f"Error: scope directory not found: {scope_dir}"
    if scope_dir != root and root not in scope_dir.parents:
        return f"Error: scope path escapes project root: {scope_path}"

    valid_tiers = ("full", "fts_only")
    if tier not in valid_tiers:
        return f"Error: invalid tier '{tier}'. Must be one of: {', '.join(valid_tiers)}"

    indexing, _, _ = _get_pipelines(project_path)

    file_paths = [
        p for p in scope_dir.glob(glob_pattern)
        if p.is_file() and not should_exclude(p.relative_to(root), DEFAULT_EXCLUDES)
    ]

    if not file_paths:
        return f"No files found in {scope_path} matching '{glob_pattern}'."

    stats = indexing.sync(file_paths, root=root, tier=tier)
    tier_label = "FTS-only" if tier == "fts_only" else "full"
    return (
        f"Indexed {stats.files_processed} files ({tier_label}), "
        f"{stats.chunks_created} chunks in {stats.duration_seconds:.1f}s. "
        f"Scope: {scope_path}"
    )
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# File discovery
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@@ -7,7 +7,7 @@ from dataclasses import dataclass
|
||||
import numpy as np
|
||||
|
||||
from ..config import Config
|
||||
from ..core import ANNIndex, BinaryStore
|
||||
from ..core.base import BaseANNIndex, BaseBinaryIndex
|
||||
from ..embed import BaseEmbedder
|
||||
from ..indexing.metadata import MetadataStore
|
||||
from ..rerank import BaseReranker
|
||||
@@ -21,6 +21,8 @@ from .fusion import (
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
_VALID_QUALITIES = ("fast", "balanced", "thorough", "auto")
|
||||
|
||||
|
||||
@dataclass
|
||||
class SearchResult:
|
||||
@@ -37,8 +39,8 @@ class SearchPipeline:
|
||||
def __init__(
|
||||
self,
|
||||
embedder: BaseEmbedder,
|
||||
binary_store: BinaryStore,
|
||||
ann_index: ANNIndex,
|
||||
binary_store: BaseBinaryIndex,
|
||||
ann_index: BaseANNIndex,
|
||||
reranker: BaseReranker,
|
||||
fts: FTSEngine,
|
||||
config: Config,
|
||||
@@ -52,6 +54,15 @@ class SearchPipeline:
|
||||
self._config = config
|
||||
self._metadata_store = metadata_store
|
||||
|
||||
# -- Helper: check if vector index has data ----------------------------
|
||||
|
||||
def _has_vector_index(self) -> bool:
|
||||
"""Check if the binary store has any indexed entries."""
|
||||
try:
|
||||
return len(self._binary_store) > 0
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
# -- Helper: vector search (binary coarse + ANN fine) -----------------
|
||||
|
||||
def _vector_search(
|
||||
@@ -84,6 +95,21 @@ class SearchPipeline:
|
||||
]
|
||||
return vector_results
|
||||
|
||||
# -- Helper: binary coarse search only --------------------------------
|
||||
|
||||
def _binary_coarse_search(
|
||||
self, query_vec: np.ndarray
|
||||
) -> list[tuple[int, float]]:
|
||||
"""Run binary coarse search only (no ANN fine search)."""
|
||||
cfg = self._config
|
||||
candidate_ids, distances = self._binary_store.coarse_search(
|
||||
query_vec, top_k=cfg.binary_top_k
|
||||
)
|
||||
return [
|
||||
(int(doc_id), float(dist))
|
||||
for doc_id, dist in zip(candidate_ids, distances)
|
||||
]
|
||||
|
||||
# -- Helper: FTS search (exact + fuzzy) ------------------------------
|
||||
|
||||
def _fts_search(
|
||||
@@ -95,55 +121,12 @@ class SearchPipeline:
|
||||
fuzzy_results = self._fts.fuzzy_search(query, top_k=cfg.fts_top_k)
|
||||
return exact_results, fuzzy_results
|
||||
|
||||
# -- Main search entry point -----------------------------------------
|
||||
# -- Helper: filter deleted IDs ---------------------------------------
|
||||
|
||||
def search(self, query: str, top_k: int | None = None) -> list[SearchResult]:
|
||||
cfg = self._config
|
||||
final_top_k = top_k if top_k is not None else cfg.reranker_top_k
|
||||
|
||||
# 1. Detect intent -> adaptive weights
|
||||
intent = detect_query_intent(query)
|
||||
weights = get_adaptive_weights(intent, cfg.fusion_weights)
|
||||
|
||||
# 2. Embed query
|
||||
query_vec = self._embedder.embed_single(query)
|
||||
|
||||
# 3. Parallel vector + FTS search
|
||||
vector_results: list[tuple[int, float]] = []
|
||||
exact_results: list[tuple[int, float]] = []
|
||||
fuzzy_results: list[tuple[int, float]] = []
|
||||
|
||||
with ThreadPoolExecutor(max_workers=2) as pool:
|
||||
vec_future = pool.submit(self._vector_search, query_vec)
|
||||
fts_future = pool.submit(self._fts_search, query)
|
||||
|
||||
# Collect vector results
|
||||
try:
|
||||
vector_results = vec_future.result()
|
||||
except Exception:
|
||||
_log.warning("Vector search failed, using empty results", exc_info=True)
|
||||
|
||||
# Collect FTS results
|
||||
try:
|
||||
exact_results, fuzzy_results = fts_future.result()
|
||||
except Exception:
|
||||
_log.warning("FTS search failed, using empty results", exc_info=True)
|
||||
|
||||
# 4. RRF fusion
|
||||
fusion_input: dict[str, list[tuple[int, float]]] = {}
|
||||
if vector_results:
|
||||
fusion_input["vector"] = vector_results
|
||||
if exact_results:
|
||||
fusion_input["exact"] = exact_results
|
||||
if fuzzy_results:
|
||||
fusion_input["fuzzy"] = fuzzy_results
|
||||
|
||||
if not fusion_input:
|
||||
return []
|
||||
|
||||
fused = reciprocal_rank_fusion(fusion_input, weights=weights, k=cfg.fusion_k)
|
||||
|
||||
# 4b. Filter out deleted IDs (tombstone filtering)
|
||||
def _filter_deleted(
|
||||
self, fused: list[tuple[int, float]]
|
||||
) -> list[tuple[int, float]]:
|
||||
"""Remove tombstoned chunk IDs from results."""
|
||||
if self._metadata_store is not None:
|
||||
deleted_ids = self._metadata_store.get_deleted_ids()
|
||||
if deleted_ids:
|
||||
@@ -152,16 +135,30 @@ class SearchPipeline:
|
||||
for doc_id, score in fused
|
||||
if doc_id not in deleted_ids
|
||||
]
|
||||
return fused
|
||||
|
||||
# 5. Rerank top candidates
|
||||
rerank_ids = [doc_id for doc_id, _ in fused[:50]]
|
||||
contents = [self._fts.get_content(doc_id) for doc_id in rerank_ids]
|
||||
rerank_scores = self._reranker.score_pairs(query, contents)
|
||||
# -- Helper: rerank and build results ---------------------------------
|
||||
|
||||
# 6. Sort by rerank score, build SearchResult list
|
||||
ranked = sorted(
|
||||
zip(rerank_ids, rerank_scores), key=lambda x: x[1], reverse=True
|
||||
)
|
||||
def _rerank_and_build(
|
||||
self,
|
||||
query: str,
|
||||
fused: list[tuple[int, float]],
|
||||
final_top_k: int,
|
||||
use_reranker: bool = True,
|
||||
) -> list[SearchResult]:
|
||||
"""Rerank candidates (optionally) and build SearchResult list."""
|
||||
if not fused:
|
||||
return []
|
||||
|
||||
if use_reranker:
|
||||
rerank_ids = [doc_id for doc_id, _ in fused[:50]]
|
||||
contents = [self._fts.get_content(doc_id) for doc_id in rerank_ids]
|
||||
rerank_scores = self._reranker.score_pairs(query, contents)
|
||||
ranked = sorted(
|
||||
zip(rerank_ids, rerank_scores), key=lambda x: x[1], reverse=True
|
||||
)
|
||||
else:
|
||||
ranked = fused
|
||||
|
||||
results: list[SearchResult] = []
|
||||
for doc_id, score in ranked[:final_top_k]:
|
||||
@@ -179,3 +176,178 @@ class SearchPipeline:
|
||||
)
|
||||
)
|
||||
return results
|
||||
|
||||
# -- Helper: record access for tier tracking --------------------------
|
||||
|
||||
def _record_access(self, results: list[SearchResult]) -> None:
|
||||
"""Record file access for data tier tracking."""
|
||||
if results and self._metadata_store is not None:
|
||||
unique_paths = list({r.path for r in results})
|
||||
try:
|
||||
self._metadata_store.record_access_batch(unique_paths)
|
||||
except Exception:
|
||||
_log.debug("Failed to record access for tier tracking", exc_info=True)
|
||||
|
||||
# -- Quality-routed search methods ------------------------------------
|
||||
|
||||
def _search_fast(
    self, query: str, final_top_k: int
) -> list[SearchResult]:
    """FTS-only search with reranking. No embedding needed."""
    exact, fuzzy = self._fts_search(query)

    # Assemble only the non-empty channels for fusion.
    channels: dict[str, list[tuple[int, float]]] = {}
    if exact:
        channels["exact"] = exact
    if fuzzy:
        channels["fuzzy"] = fuzzy
    if not channels:
        return []

    fused = reciprocal_rank_fusion(
        channels,
        weights={"exact": 0.7, "fuzzy": 0.3},
        k=self._config.fusion_k,
    )
    fused = self._filter_deleted(fused)
    return self._rerank_and_build(query, fused, final_top_k, use_reranker=True)
|
||||
|
||||
def _search_balanced(
    self, query: str, final_top_k: int
) -> list[SearchResult]:
    """FTS + binary coarse search with RRF fusion and reranking.

    Embeds the query for binary coarse search but skips ANN fine search.

    Args:
        query: Search query string.
        final_top_k: Maximum number of results after reranking.

    Returns:
        List of SearchResult ordered by rerank score.
    """
    # Adaptive fusion weights derived from the detected query intent.
    intent = detect_query_intent(query)
    weights = get_adaptive_weights(intent, self._config.fusion_weights)

    query_vec = self._embedder.embed_single(query)

    # Parallel: binary coarse + FTS
    coarse_results: list[tuple[int, float]] = []
    exact_results: list[tuple[int, float]] = []
    fuzzy_results: list[tuple[int, float]] = []

    with ThreadPoolExecutor(max_workers=2) as pool:
        coarse_future = pool.submit(self._binary_coarse_search, query_vec)
        fts_future = pool.submit(self._fts_search, query)

        # Either channel may fail independently; degrade gracefully to
        # whatever the other channel produced (pre-initialized empties).
        try:
            coarse_results = coarse_future.result()
        except Exception:
            _log.warning("Binary coarse search failed", exc_info=True)

        try:
            exact_results, fuzzy_results = fts_future.result()
        except Exception:
            _log.warning("FTS search failed", exc_info=True)

    # Only non-empty channels participate in RRF fusion.
    fusion_input: dict[str, list[tuple[int, float]]] = {}
    if coarse_results:
        fusion_input["vector"] = coarse_results
    if exact_results:
        fusion_input["exact"] = exact_results
    if fuzzy_results:
        fusion_input["fuzzy"] = fuzzy_results

    if not fusion_input:
        return []

    fused = reciprocal_rank_fusion(fusion_input, weights=weights, k=self._config.fusion_k)
    # Drop tombstoned chunk IDs before spending reranker time on them.
    fused = self._filter_deleted(fused)
    return self._rerank_and_build(query, fused, final_top_k, use_reranker=True)
|
||||
|
||||
def _search_thorough(
    self, query: str, final_top_k: int
) -> list[SearchResult]:
    """Full 2-stage vector + FTS + reranking pipeline (original behavior).

    Args:
        query: Search query string.
        final_top_k: Maximum number of results after reranking.

    Returns:
        List of SearchResult ordered by rerank score.
    """
    cfg = self._config

    # Adaptive fusion weights derived from the detected query intent.
    intent = detect_query_intent(query)
    weights = get_adaptive_weights(intent, cfg.fusion_weights)

    query_vec = self._embedder.embed_single(query)

    # Parallel vector + FTS search
    vector_results: list[tuple[int, float]] = []
    exact_results: list[tuple[int, float]] = []
    fuzzy_results: list[tuple[int, float]] = []

    with ThreadPoolExecutor(max_workers=2) as pool:
        vec_future = pool.submit(self._vector_search, query_vec)
        fts_future = pool.submit(self._fts_search, query)

        # Either channel may fail independently; degrade gracefully to
        # whatever the other channel produced (pre-initialized empties).
        try:
            vector_results = vec_future.result()
        except Exception:
            _log.warning("Vector search failed, using empty results", exc_info=True)

        try:
            exact_results, fuzzy_results = fts_future.result()
        except Exception:
            _log.warning("FTS search failed, using empty results", exc_info=True)

    # Only non-empty channels participate in RRF fusion.
    fusion_input: dict[str, list[tuple[int, float]]] = {}
    if vector_results:
        fusion_input["vector"] = vector_results
    if exact_results:
        fusion_input["exact"] = exact_results
    if fuzzy_results:
        fusion_input["fuzzy"] = fuzzy_results

    if not fusion_input:
        return []

    fused = reciprocal_rank_fusion(fusion_input, weights=weights, k=cfg.fusion_k)
    # Drop tombstoned chunk IDs before spending reranker time on them.
    fused = self._filter_deleted(fused)
    return self._rerank_and_build(query, fused, final_top_k, use_reranker=True)
|
||||
|
||||
# -- Main search entry point -----------------------------------------
|
||||
|
||||
def search(
    self,
    query: str,
    top_k: int | None = None,
    quality: str | None = None,
) -> list[SearchResult]:
    """Route a query through the requested search quality tier.

    Args:
        query: Search query string.
        top_k: Maximum results to return.
        quality: Search quality tier:
            - 'fast': FTS-only + rerank (no embedding, no vector search)
            - 'balanced': FTS + binary coarse + rerank (no ANN fine search)
            - 'thorough': Full 2-stage vector + FTS + reranking
            - 'auto': Selects 'thorough' if vectors exist, else 'fast'
            - None: Uses config.default_search_quality

    Returns:
        List of SearchResult ordered by relevance.
    """
    cfg = self._config
    final_top_k = cfg.reranker_top_k if top_k is None else top_k

    # Resolve the effective tier: explicit argument, then config default.
    requested = quality or cfg.default_search_quality
    if requested not in _VALID_QUALITIES:
        _log.warning(
            "Invalid search quality '%s', falling back to 'auto'",
            requested,
        )
        requested = "auto"

    # 'auto' prefers the thorough pipeline only when vectors are indexed.
    if requested == "auto":
        requested = "thorough" if self._has_vector_index() else "fast"

    dispatch = {
        "fast": self._search_fast,
        "balanced": self._search_balanced,
    }
    handler = dispatch.get(requested, self._search_thorough)
    results = handler(query, final_top_k)

    self._record_access(results)
    return results
|
||||
|
||||
@@ -20,6 +20,7 @@ from watchdog.events import FileSystemEventHandler
|
||||
from watchdog.observers import Observer
|
||||
|
||||
from .events import ChangeType, FileEvent, WatcherConfig
|
||||
from .incremental_indexer import IncrementalIndexer
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -261,3 +262,24 @@ class FileWatcher:
|
||||
if output:
|
||||
sys.stdout.write(output + "\n")
|
||||
sys.stdout.flush()
|
||||
|
||||
@classmethod
|
||||
def create_with_indexer(
|
||||
cls,
|
||||
root_path: Path,
|
||||
config: WatcherConfig,
|
||||
indexer: IncrementalIndexer,
|
||||
) -> "FileWatcher":
|
||||
"""Create a FileWatcher wired to an IncrementalIndexer's async path.
|
||||
|
||||
Uses ``indexer.process_events_async()`` as the callback so that
|
||||
events are debounced and batched within the indexer before
|
||||
processing, preventing redundant per-file pipeline startups.
|
||||
|
||||
Example::
|
||||
|
||||
indexer = IncrementalIndexer(pipeline, root=root)
|
||||
watcher = FileWatcher.create_with_indexer(root, config, indexer)
|
||||
watcher.start()
|
||||
"""
|
||||
return cls(root_path, config, indexer.process_events_async)
|
||||
|
||||
@@ -4,10 +4,13 @@ Ported from codex-lens v1 with simplifications:
|
||||
- Uses IndexingPipeline.index_file() / remove_file() directly
|
||||
- No v1-specific Config, ParserFactory, DirIndexStore dependencies
|
||||
- Per-file error isolation: one failure does not stop batch processing
|
||||
- Debounce batching: process_events_async() buffers events and flushes
|
||||
after a configurable window to prevent redundant per-file pipeline startups
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import threading
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
@@ -60,6 +63,7 @@ class IncrementalIndexer:
|
||||
pipeline: IndexingPipeline,
|
||||
*,
|
||||
root: Optional[Path] = None,
|
||||
debounce_window_ms: int = 500,
|
||||
) -> None:
|
||||
"""Initialize the incremental indexer.
|
||||
|
||||
@@ -67,9 +71,15 @@ class IncrementalIndexer:
|
||||
pipeline: The indexing pipeline with metadata store configured.
|
||||
root: Optional project root for computing relative paths.
|
||||
If None, absolute paths are used as identifiers.
|
||||
debounce_window_ms: Milliseconds to buffer events before flushing
|
||||
in process_events_async(). Default 500ms.
|
||||
"""
|
||||
self._pipeline = pipeline
|
||||
self._root = root
|
||||
self._debounce_window_ms = debounce_window_ms
|
||||
self._event_buffer: List[FileEvent] = []
|
||||
self._buffer_lock = threading.Lock()
|
||||
self._flush_timer: Optional[threading.Timer] = None
|
||||
|
||||
def process_events(self, events: List[FileEvent]) -> BatchResult:
|
||||
"""Process a batch of file events with per-file error isolation.
|
||||
@@ -107,6 +117,52 @@ class IncrementalIndexer:
|
||||
|
||||
return result
|
||||
|
||||
def process_events_async(self, events: List[FileEvent]) -> None:
|
||||
"""Buffer events and flush after the debounce window expires.
|
||||
|
||||
Non-blocking: events are accumulated in an internal buffer.
|
||||
When no new events arrive within *debounce_window_ms*, the buffer
|
||||
is flushed and all accumulated events are processed as a single
|
||||
batch via process_events().
|
||||
|
||||
Args:
|
||||
events: List of file events to buffer.
|
||||
"""
|
||||
with self._buffer_lock:
|
||||
self._event_buffer.extend(events)
|
||||
|
||||
# Cancel previous timer and start a new one (true debounce)
|
||||
if self._flush_timer is not None:
|
||||
self._flush_timer.cancel()
|
||||
|
||||
self._flush_timer = threading.Timer(
|
||||
self._debounce_window_ms / 1000.0,
|
||||
self._flush_buffer,
|
||||
)
|
||||
self._flush_timer.daemon = True
|
||||
self._flush_timer.start()
|
||||
|
||||
def _flush_buffer(self) -> None:
|
||||
"""Flush the event buffer and process all accumulated events."""
|
||||
with self._buffer_lock:
|
||||
if not self._event_buffer:
|
||||
return
|
||||
events = list(self._event_buffer)
|
||||
self._event_buffer.clear()
|
||||
self._flush_timer = None
|
||||
|
||||
# Deduplicate: keep the last event per path
|
||||
seen: dict[Path, FileEvent] = {}
|
||||
for event in events:
|
||||
seen[event.path] = event
|
||||
deduped = list(seen.values())
|
||||
|
||||
logger.debug(
|
||||
"Flushing debounce buffer: %d events (%d after dedup)",
|
||||
len(events), len(deduped),
|
||||
)
|
||||
self.process_events(deduped)
|
||||
|
||||
def _handle_index(self, event: FileEvent, result: BatchResult) -> None:
|
||||
"""Index a created or modified file."""
|
||||
stats = self._pipeline.index_file(
|
||||
|
||||
Reference in New Issue
Block a user