mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-03-19 18:58:47 +08:00
feat: enhance search, ranking, reranker and CLI tooling across ccw and codex-lens
Major improvements to smart-search, chain-search cascade, ranking pipeline, reranker factory, CLI history store, codex-lens integration, and uv-manager. Simplify command-generator skill by inlining phases. Add comprehensive tests. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -41,6 +41,56 @@ pip install codex-lens[semantic-directml]
|
||||
pip install codex-lens[full]
|
||||
```
|
||||
|
||||
### Local ONNX Reranker Bootstrap

Use the pinned bootstrap flow when you want the local-only reranker backend in an
existing CodexLens virtual environment without asking pip to resolve the whole
project extras set at once.

1. Start from the CodexLens repo root and create or activate the project venv.
2. Review the pinned install manifest in `scripts/requirements-reranker-local.txt`.
3. Render the deterministic setup plan:
|
||||
|
||||
```bash
|
||||
python scripts/bootstrap_reranker_local.py --dry-run
|
||||
```
|
||||
|
||||
The bootstrap script always targets the selected venv Python, installs the local
|
||||
ONNX reranker stack in a fixed order, and keeps the package set pinned to the
|
||||
validated Python 3.13-compatible combination:
|
||||
|
||||
- `numpy==2.4.0`
|
||||
- `onnxruntime==1.23.2`
|
||||
- `huggingface-hub==0.36.2`
|
||||
- `transformers==4.53.3`
|
||||
- `optimum[onnxruntime]==2.1.0`
|
||||
|
||||
When you are ready to apply it to the CodexLens venv, use:
|
||||
|
||||
```bash
|
||||
python scripts/bootstrap_reranker_local.py --apply
|
||||
```
|
||||
|
||||
To pre-download the default local reranker model (`Xenova/ms-marco-MiniLM-L-6-v2`)
|
||||
into the repo-local Hugging Face cache, use:
|
||||
|
||||
```bash
|
||||
python scripts/bootstrap_reranker_local.py --apply --download-model
|
||||
```
|
||||
|
||||
The dry-run plan also prints the equivalent explicit model download command. On
|
||||
Windows PowerShell with the default repo venv, it looks like:
|
||||
|
||||
```bash
|
||||
.venv/Scripts/hf.exe download Xenova/ms-marco-MiniLM-L-6-v2 --local-dir .cache/huggingface/models/Xenova--ms-marco-MiniLM-L-6-v2
|
||||
```
|
||||
|
||||
After installation, probe the backend from the same venv:
|
||||
|
||||
```bash
|
||||
python scripts/bootstrap_reranker_local.py --apply --probe
|
||||
```
|
||||
|
||||
## Requirements
|
||||
|
||||
- Python >= 3.10
|
||||
|
||||
@@ -0,0 +1,16 @@
|
||||
{"query":"executeHybridMode dense_rerank semantic smart_search","relevant_paths":["ccw/src/tools/smart-search.ts"],"intent":"ccw-semantic-routing","notes":"CCW semantic mode delegates to CodexLens dense_rerank."}
|
||||
{"query":"parse CodexLens JSON output strip ANSI smart_search","relevant_paths":["ccw/src/tools/smart-search.ts"],"intent":"ccw-json-fallback","notes":"Covers JSON/plain-text fallback handling for CodexLens output."}
|
||||
{"query":"smart_search init embed search action schema","relevant_paths":["ccw/src/tools/smart-search.ts"],"intent":"ccw-action-schema","notes":"Find the Zod schema that defines init/embed/search actions."}
|
||||
{"query":"auto init missing job dedupe smart_search","relevant_paths":["ccw/src/tools/smart-search.ts"],"intent":"ccw-auto-init","notes":"Targets background init/embed warmup and dedupe state."}
|
||||
{"query":"smart_search exact mode fallback to CodexLens fts","relevant_paths":["ccw/src/tools/smart-search.ts"],"intent":"ccw-exact-fallback","notes":"Tracks the exact-mode fallback path into CodexLens FTS."}
|
||||
{"query":"smart_search settings snapshot embedding backend reranker backend staged stage2 mode","relevant_paths":["ccw/src/tools/smart-search.ts"],"intent":"ccw-config-snapshot","notes":"Reads local config snapshot for embedding/reranker/staged pipeline settings."}
|
||||
{"query":"embedding backend fastembed local litellm api config","relevant_paths":["codex-lens/src/codexlens/config.py"],"intent":"codexlens-embedding-config","notes":"Local-only benchmark should resolve to fastembed defaults."}
|
||||
{"query":"reranker backend onnx api legacy configuration","relevant_paths":["codex-lens/src/codexlens/config.py","codex-lens/src/codexlens/env_config.py"],"intent":"codexlens-reranker-config","notes":"Covers both config dataclass fields and env overrides."}
|
||||
{"query":"staged stage2 mode precomputed realtime static_global_graph","relevant_paths":["codex-lens/src/codexlens/config.py","codex-lens/src/codexlens/env_config.py"],"intent":"codexlens-stage2-config","notes":"Benchmark matrix should exercise the three supported stage2 modes."}
|
||||
{"query":"enable staged rerank stage 4 config","relevant_paths":["codex-lens/src/codexlens/config.py"],"intent":"codexlens-stage4-rerank","notes":"Stage 4 rerank flag needs to stay enabled for local benchmarks."}
|
||||
{"query":"cascade_search dense_rerank staged pipeline ChainSearchEngine","relevant_paths":["codex-lens/src/codexlens/search/chain_search.py"],"intent":"chain-search-cascade","notes":"Baseline query for the central retrieval engine."}
|
||||
{"query":"realtime LSP expand stage2 search pipeline","relevant_paths":["codex-lens/src/codexlens/search/chain_search.py"],"intent":"chain-search-stage2-realtime","notes":"Targets realtime stage2 expansion logic."}
|
||||
{"query":"static global graph stage2 expansion implementation","relevant_paths":["codex-lens/src/codexlens/search/chain_search.py"],"intent":"chain-search-stage2-static","notes":"Targets static_global_graph stage2 expansion logic."}
|
||||
{"query":"cross encoder rerank stage 4 implementation","relevant_paths":["codex-lens/src/codexlens/search/chain_search.py"],"intent":"chain-search-rerank","notes":"Relevant for dense_rerank and staged rerank latency comparisons."}
|
||||
{"query":"get_reranker factory onnx backend selection","relevant_paths":["codex-lens/src/codexlens/semantic/reranker/factory.py"],"intent":"reranker-factory","notes":"Keeps the benchmark aligned with local ONNX reranker selection."}
|
||||
{"query":"EMBEDDING_BACKEND and RERANKER_BACKEND environment variables","relevant_paths":["codex-lens/src/codexlens/env_config.py"],"intent":"env-overrides","notes":"Covers CCW/CodexLens local-only environment overrides."}
|
||||
@@ -239,6 +239,7 @@ def main() -> None:
|
||||
config.staged_clustering_strategy = str(args.staged_cluster_strategy or "path").strip().lower()
|
||||
# Stability: on some Windows setups, DirectML/ONNX can crash under load.
|
||||
config.embedding_use_gpu = False
|
||||
config.reranker_use_gpu = False
|
||||
|
||||
registry = RegistryStore()
|
||||
registry.initialize()
|
||||
@@ -362,4 +363,3 @@ def main() -> None:
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
|
||||
980
codex-lens/benchmarks/compare_ccw_smart_search_stage2.py
Normal file
980
codex-lens/benchmarks/compare_ccw_smart_search_stage2.py
Normal file
@@ -0,0 +1,980 @@
|
||||
#!/usr/bin/env python
|
||||
"""Benchmark local-only staged stage2 modes for CCW smart_search queries.
|
||||
|
||||
This benchmark reuses the existing CodexLens benchmark style, but focuses on
|
||||
the real search intents that drive CCW `smart_search`. It evaluates:
|
||||
|
||||
1. `dense_rerank` baseline
|
||||
2. `staged` + `precomputed`
|
||||
3. `staged` + `realtime`
|
||||
4. `staged` + `static_global_graph`
|
||||
|
||||
Metrics:
|
||||
- Hit@K
|
||||
- MRR@K
|
||||
- Recall@K
|
||||
- latency (avg/p50/p95)
|
||||
|
||||
The runner is intentionally local-only. By default it uses:
|
||||
- embedding backend: `fastembed`
|
||||
- reranker backend: `onnx`
|
||||
|
||||
Examples:
|
||||
python benchmarks/compare_ccw_smart_search_stage2.py --dry-run
|
||||
python benchmarks/compare_ccw_smart_search_stage2.py --self-check
|
||||
python benchmarks/compare_ccw_smart_search_stage2.py --source .. --k 10
|
||||
python benchmarks/compare_ccw_smart_search_stage2.py --embedding-model code --reranker-model cross-encoder/ms-marco-MiniLM-L-6-v2
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
from copy import deepcopy
|
||||
import gc
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import statistics
|
||||
import sys
|
||||
import time
|
||||
from dataclasses import asdict, dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
||||
|
||||
from codexlens.config import Config
|
||||
from codexlens.search.chain_search import ChainSearchEngine, SearchOptions
|
||||
from codexlens.search.ranking import (
|
||||
QueryIntent,
|
||||
detect_query_intent,
|
||||
is_generated_artifact_path,
|
||||
is_test_file,
|
||||
query_prefers_lexical_search,
|
||||
query_targets_generated_files,
|
||||
)
|
||||
from codexlens.storage.path_mapper import PathMapper
|
||||
from codexlens.storage.registry import RegistryStore
|
||||
|
||||
|
||||
DEFAULT_SOURCE = Path(__file__).resolve().parents[2]
|
||||
DEFAULT_QUERIES_FILE = Path(__file__).parent / "accuracy_queries_ccw_smart_search.jsonl"
|
||||
DEFAULT_OUTPUT = Path(__file__).parent / "results" / "ccw_smart_search_stage2.json"
|
||||
|
||||
VALID_STAGE2_MODES = ("precomputed", "realtime", "static_global_graph")
|
||||
VALID_LOCAL_EMBEDDING_BACKENDS = ("fastembed",)
|
||||
VALID_LOCAL_RERANKER_BACKENDS = ("onnx", "fastembed", "legacy")
|
||||
VALID_BASELINE_METHODS = ("auto", "fts", "hybrid")
|
||||
DEFAULT_LOCAL_ONNX_RERANKER_MODEL = "Xenova/ms-marco-MiniLM-L-6-v2"
|
||||
|
||||
|
||||
def _now_ms() -> float:
|
||||
return time.perf_counter() * 1000.0
|
||||
|
||||
|
||||
def _normalize_path_key(path: str) -> str:
|
||||
try:
|
||||
candidate = Path(path)
|
||||
if str(candidate) and (candidate.is_absolute() or re.match(r"^[A-Za-z]:", str(candidate))):
|
||||
normalized = str(candidate.resolve())
|
||||
else:
|
||||
normalized = str(candidate)
|
||||
except Exception:
|
||||
normalized = path
|
||||
normalized = normalized.replace("/", "\\")
|
||||
if os.name == "nt":
|
||||
normalized = normalized.lower()
|
||||
return normalized
|
||||
|
||||
|
||||
def _dedup_topk(paths: Iterable[str], k: int) -> List[str]:
|
||||
output: List[str] = []
|
||||
seen: set[str] = set()
|
||||
for path in paths:
|
||||
if path in seen:
|
||||
continue
|
||||
seen.add(path)
|
||||
output.append(path)
|
||||
if len(output) >= k:
|
||||
break
|
||||
return output
|
||||
|
||||
|
||||
def _first_hit_rank(topk_paths: Sequence[str], relevant: set[str]) -> Optional[int]:
|
||||
for index, path in enumerate(topk_paths, start=1):
|
||||
if path in relevant:
|
||||
return index
|
||||
return None
|
||||
|
||||
|
||||
def _mrr(ranks: Sequence[Optional[int]]) -> float:
|
||||
values = [1.0 / rank for rank in ranks if rank and rank > 0]
|
||||
return statistics.mean(values) if values else 0.0
|
||||
|
||||
|
||||
def _mean(values: Sequence[float]) -> float:
|
||||
return statistics.mean(values) if values else 0.0
|
||||
|
||||
|
||||
def _percentile(values: Sequence[float], percentile: float) -> float:
|
||||
if not values:
|
||||
return 0.0
|
||||
ordered = sorted(values)
|
||||
if len(ordered) == 1:
|
||||
return ordered[0]
|
||||
index = (len(ordered) - 1) * percentile
|
||||
lower = int(index)
|
||||
upper = min(lower + 1, len(ordered) - 1)
|
||||
if lower == upper:
|
||||
return ordered[lower]
|
||||
fraction = index - lower
|
||||
return ordered[lower] + (ordered[upper] - ordered[lower]) * fraction
|
||||
|
||||
|
||||
def _load_labeled_queries(path: Path, limit: Optional[int]) -> List[Dict[str, Any]]:
|
||||
if not path.is_file():
|
||||
raise SystemExit(f"Queries file does not exist: {path}")
|
||||
|
||||
output: List[Dict[str, Any]] = []
|
||||
for raw_line in path.read_text(encoding="utf-8", errors="ignore").splitlines():
|
||||
line = raw_line.strip()
|
||||
if not line or line.startswith("#"):
|
||||
continue
|
||||
try:
|
||||
item = json.loads(line)
|
||||
except Exception as exc:
|
||||
raise SystemExit(f"Invalid JSONL line in {path}: {raw_line!r} ({exc})") from exc
|
||||
if not isinstance(item, dict) or "query" not in item or "relevant_paths" not in item:
|
||||
raise SystemExit(f"Invalid query item (expected object with query/relevant_paths): {item!r}")
|
||||
relevant_paths = item.get("relevant_paths")
|
||||
if not isinstance(relevant_paths, list) or not relevant_paths:
|
||||
raise SystemExit(f"Query item must include non-empty relevant_paths[]: {item!r}")
|
||||
output.append(item)
|
||||
if limit is not None and len(output) >= limit:
|
||||
break
|
||||
return output
|
||||
|
||||
|
||||
def _resolve_expected_paths(source_root: Path, paths: Sequence[str]) -> Tuple[List[str], set[str], List[str]]:
    """Resolve dataset-relative paths against *source_root*.

    Returns (display strings, normalized comparison keys, missing paths).
    Paths that do not exist on disk are still included in the first two
    outputs so callers can report them alongside the miss list.
    """
    display: List[str] = []
    keys: set[str] = set()
    absent: List[str] = []

    for entry in paths:
        target = Path(entry)
        if not target.is_absolute():
            target = (source_root / target).resolve()
        text = str(target)
        if not target.exists():
            absent.append(text)
        display.append(text)
        keys.add(_normalize_path_key(text))
    return display, keys, absent
|
||||
|
||||
|
||||
def _validate_local_only_backends(embedding_backend: str, reranker_backend: str) -> None:
    """Abort with SystemExit unless both backends are permitted local-only choices."""
    if embedding_backend not in VALID_LOCAL_EMBEDDING_BACKENDS:
        allowed = ", ".join(VALID_LOCAL_EMBEDDING_BACKENDS)
        raise SystemExit(
            "This runner is local-only. "
            f"--embedding-backend must be one of {allowed}; got {embedding_backend!r}"
        )
    if reranker_backend not in VALID_LOCAL_RERANKER_BACKENDS:
        allowed = ", ".join(VALID_LOCAL_RERANKER_BACKENDS)
        raise SystemExit(
            "This runner is local-only. "
            f"--reranker-backend must be one of {allowed}; got {reranker_backend!r}"
        )
|
||||
|
||||
|
||||
def _validate_stage2_modes(stage2_modes: Sequence[str]) -> List[str]:
    """Normalize, validate, and order-preservingly dedupe the requested stage2 modes."""
    cleaned = [str(mode).strip().lower() for mode in stage2_modes if str(mode).strip()]
    if not cleaned:
        raise SystemExit("At least one --stage2-modes entry is required")
    for mode in cleaned:
        if mode not in VALID_STAGE2_MODES:
            raise SystemExit(
                f"Invalid --stage2-modes entry: {mode} "
                f"(valid: {', '.join(VALID_STAGE2_MODES)})"
            )
    # dict.fromkeys keeps first-seen order while dropping duplicates.
    return list(dict.fromkeys(cleaned))
|
||||
|
||||
|
||||
def _validate_baseline_methods(methods: Sequence[str]) -> List[str]:
    """Normalize, validate, and order-preservingly dedupe the baseline methods.

    Unlike stage2 modes, an empty selection is allowed (no baselines).
    """
    cleaned = [str(method).strip().lower() for method in methods if str(method).strip()]
    for method in cleaned:
        if method not in VALID_BASELINE_METHODS:
            raise SystemExit(
                f"Invalid --baseline-methods entry: {method} "
                f"(valid: {', '.join(VALID_BASELINE_METHODS)})"
            )
    # dict.fromkeys keeps first-seen order while dropping duplicates.
    return list(dict.fromkeys(cleaned))
|
||||
|
||||
|
||||
@dataclass
class StrategyRun:
    """Outcome of running one strategy against a single labeled query."""

    strategy_key: str  # unique key, e.g. "hybrid" or "staged:realtime"
    strategy: str  # requested strategy name
    stage2_mode: Optional[str]  # staged stage2 mode, or None for baselines
    effective_method: str  # method after "auto" resolution
    execution_method: str  # "cascade" for dense_rerank/staged, else same as effective_method
    latency_ms: float  # wall-clock latency of the search call
    topk_paths: List[str]  # normalized, deduped top-k result paths
    first_hit_rank: Optional[int]  # 1-based rank of first relevant hit; None on miss
    hit_at_k: bool  # True when any relevant path appears in top-k
    recall_at_k: float  # |topk intersect relevant| / |relevant|
    generated_artifact_count: int  # top-k entries flagged as generated artifacts
    test_file_count: int  # top-k entries flagged as test files
    error: Optional[str] = None  # "ExcType: message" when the run failed, else None
|
||||
|
||||
|
||||
@dataclass
class QueryEvaluation:
    """Per-query evaluation: dataset labels plus one StrategyRun per strategy."""

    query: str  # the benchmark query text
    intent: Optional[str]  # dataset-provided intent label, if any
    notes: Optional[str]  # dataset-provided free-form notes, if any
    relevant_paths: List[str]  # resolved expected result paths
    runs: Dict[str, StrategyRun]  # strategy_key -> run outcome
|
||||
|
||||
|
||||
@dataclass
class PairwiseDelta:
    """Metric deltas (mode_a minus mode_b) between two summarized strategies."""

    mode_a: str  # first strategy key in the pair
    mode_b: str  # second strategy key in the pair
    hit_at_k_delta: float  # hit@k(a) - hit@k(b)
    mrr_at_k_delta: float  # mrr@k(a) - mrr@k(b)
    avg_recall_at_k_delta: float  # avg recall@k(a) - avg recall@k(b)
    avg_latency_ms_delta: float  # avg latency(a) - avg latency(b)
|
||||
|
||||
|
||||
@dataclass
class StrategySpec:
    """Declarative description of one strategy in the comparison matrix."""

    strategy_key: str  # unique reporting key, e.g. "staged:precomputed"
    strategy: str  # strategy name used when executing the search
    stage2_mode: Optional[str]  # stage2 mode for staged runs, else None
|
||||
|
||||
|
||||
@dataclass
class StrategyRuntime:
    """Bundle of the per-strategy engine and its isolated config/registry."""

    strategy_spec: StrategySpec  # the strategy this runtime serves
    config: Config  # deep-copied config so per-strategy mutations don't leak
    registry: RegistryStore  # freshly initialized index registry
    engine: ChainSearchEngine  # search engine bound to the copied config
|
||||
|
||||
|
||||
def _strategy_specs(
    stage2_modes: Sequence[str],
    include_dense_baseline: bool,
    *,
    baseline_methods: Sequence[str],
) -> List[StrategySpec]:
    """Build the ordered strategy matrix: baselines, optional dense_rerank, staged modes."""
    matrix = [
        StrategySpec(strategy_key=method, strategy=method, stage2_mode=None)
        for method in baseline_methods
    ]
    if include_dense_baseline:
        matrix.append(
            StrategySpec(strategy_key="dense_rerank", strategy="dense_rerank", stage2_mode=None)
        )
    matrix.extend(
        StrategySpec(strategy_key=f"staged:{mode}", strategy="staged", stage2_mode=mode)
        for mode in stage2_modes
    )
    return matrix
|
||||
|
||||
|
||||
def _build_strategy_runtime(base_config: Config, strategy_spec: StrategySpec) -> StrategyRuntime:
    """Create an isolated runtime (config copy, registry, engine) for one strategy.

    The config is deep-copied so per-strategy mutations (cascade_strategy,
    staged_stage2_mode, applied later in _run_strategy) cannot leak between
    strategies sharing the same base config.
    """
    runtime_config = deepcopy(base_config)
    registry = RegistryStore()
    registry.initialize()
    mapper = PathMapper()
    engine = ChainSearchEngine(registry=registry, mapper=mapper, config=runtime_config)
    return StrategyRuntime(
        strategy_spec=strategy_spec,
        config=runtime_config,
        registry=registry,
        engine=engine,
    )
|
||||
|
||||
|
||||
def _select_effective_method(query: str, requested_method: str) -> str:
    """Resolve "auto" into a concrete search method via query-intent heuristics."""
    method = str(requested_method).strip().lower()
    if method != "auto":
        # Explicit requests are honored verbatim.
        return method
    # Lexical-looking queries route straight to FTS.
    if query_targets_generated_files(query) or query_prefers_lexical_search(query):
        return "fts"
    intent = detect_query_intent(query)
    if intent == QueryIntent.KEYWORD:
        return "fts"
    return "dense_rerank" if intent == QueryIntent.SEMANTIC else "hybrid"
|
||||
|
||||
|
||||
def _filter_dataset_by_query_match(
|
||||
dataset: Sequence[Dict[str, Any]],
|
||||
query_match: Optional[str],
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""Filter labeled queries by case-insensitive substring match."""
|
||||
needle = str(query_match or "").strip().casefold()
|
||||
if not needle:
|
||||
return list(dataset)
|
||||
return [
|
||||
dict(item)
|
||||
for item in dataset
|
||||
if needle in str(item.get("query", "")).casefold()
|
||||
]
|
||||
|
||||
|
||||
def _apply_query_limit(
|
||||
dataset: Sequence[Dict[str, Any]],
|
||||
query_limit: Optional[int],
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""Apply the optional query limit after any dataset-level filtering."""
|
||||
if query_limit is None:
|
||||
return list(dataset)
|
||||
return [dict(item) for item in list(dataset)[: max(0, int(query_limit))]]
|
||||
|
||||
|
||||
def _write_json_payload(path: Path, payload: Dict[str, Any]) -> None:
|
||||
"""Persist a benchmark payload as UTF-8 JSON."""
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
|
||||
|
||||
|
||||
def _write_final_outputs(
    *,
    output_path: Path,
    progress_output: Optional[Path],
    payload: Dict[str, Any],
) -> None:
    """Persist the final payload to the result path and, when set, the progress path."""
    for destination in (output_path, progress_output):
        if destination is not None:
            _write_json_payload(destination, payload)
|
||||
|
||||
|
||||
def _make_progress_payload(
|
||||
*,
|
||||
args: argparse.Namespace,
|
||||
source_root: Path,
|
||||
strategy_specs: Sequence[StrategySpec],
|
||||
evaluations: Sequence[QueryEvaluation],
|
||||
query_index: int,
|
||||
total_queries: int,
|
||||
run_index: int,
|
||||
total_runs: int,
|
||||
current_query: str,
|
||||
current_strategy_key: str,
|
||||
) -> Dict[str, Any]:
|
||||
"""Create a partial progress snapshot for long benchmark runs."""
|
||||
return {
|
||||
"status": "running",
|
||||
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"source": str(source_root),
|
||||
"queries_file": str(args.queries_file),
|
||||
"query_count": len(evaluations),
|
||||
"planned_query_count": total_queries,
|
||||
"k": int(args.k),
|
||||
"coarse_k": int(args.coarse_k),
|
||||
"strategy_keys": [spec.strategy_key for spec in strategy_specs],
|
||||
"progress": {
|
||||
"completed_queries": query_index,
|
||||
"total_queries": total_queries,
|
||||
"completed_runs": run_index,
|
||||
"total_runs": total_runs,
|
||||
"current_query": current_query,
|
||||
"current_strategy_key": current_strategy_key,
|
||||
},
|
||||
"evaluations": [
|
||||
{
|
||||
"query": evaluation.query,
|
||||
"intent": evaluation.intent,
|
||||
"notes": evaluation.notes,
|
||||
"relevant_paths": evaluation.relevant_paths,
|
||||
"runs": {key: asdict(run) for key, run in evaluation.runs.items()},
|
||||
}
|
||||
for evaluation in evaluations
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
def _make_search_options(method: str, *, k: int) -> SearchOptions:
    """Translate a benchmark method name into engine SearchOptions.

    fts: lexical-only; hybrid: lexical + vector; dense_rerank/staged: hybrid
    plus the cascade pipeline. Raises ValueError for any other method.
    """
    normalized = str(method).strip().lower()
    # (hybrid_mode, enable_vector, enable_cascade) per supported method.
    flag_table = {
        "fts": (False, False, False),
        "hybrid": (True, True, False),
        "dense_rerank": (True, True, True),
        "staged": (True, True, True),
    }
    if normalized not in flag_table:
        raise ValueError(f"Unsupported benchmark method: {method}")
    hybrid, vector, cascade = flag_table[normalized]
    return SearchOptions(
        total_limit=k,
        hybrid_mode=hybrid,
        enable_fuzzy=False,
        enable_vector=vector,
        pure_vector=False,
        enable_cascade=cascade,
    )
|
||||
|
||||
|
||||
def _run_strategy(
    engine: ChainSearchEngine,
    config: Config,
    *,
    strategy_spec: StrategySpec,
    query: str,
    source_path: Path,
    k: int,
    coarse_k: int,
    relevant: set[str],
) -> StrategyRun:
    """Execute one strategy for one query and score the result.

    Temporarily mutates *config* (cascade_strategy / staged_stage2_mode) for
    staged and dense_rerank runs; the previous values are restored in the
    finally block. Failures are captured in the returned StrategyRun's
    `error` field instead of propagating.
    """
    # Reduce cross-run memory pressure before timing the search.
    gc.collect()
    effective_method = _select_effective_method(query, strategy_spec.strategy)
    # dense_rerank/staged both execute through the cascade pipeline.
    execution_method = "cascade" if effective_method in {"dense_rerank", "staged"} else effective_method
    # Snapshot config state so the finally block can restore it.
    previous_cascade_strategy = getattr(config, "cascade_strategy", None)
    previous_stage2_mode = getattr(config, "staged_stage2_mode", None)

    start_ms = _now_ms()
    try:
        options = _make_search_options(
            "staged" if strategy_spec.strategy == "staged" else effective_method,
            k=k,
        )
        if strategy_spec.strategy == "staged":
            config.cascade_strategy = "staged"
            if strategy_spec.stage2_mode:
                config.staged_stage2_mode = strategy_spec.stage2_mode
            result = engine.cascade_search(
                query=query,
                source_path=source_path,
                k=k,
                coarse_k=coarse_k,
                options=options,
                strategy="staged",
            )
        elif effective_method == "dense_rerank":
            config.cascade_strategy = "dense_rerank"
            result = engine.cascade_search(
                query=query,
                source_path=source_path,
                k=k,
                coarse_k=coarse_k,
                options=options,
                strategy="dense_rerank",
            )
        else:
            # fts / hybrid baselines use the plain search entrypoint.
            result = engine.search(
                query=query,
                source_path=source_path,
                options=options,
            )
        latency_ms = _now_ms() - start_ms
        # Normalize, dedupe, and truncate result paths before scoring.
        paths_raw = [item.path for item in (result.results or []) if getattr(item, "path", None)]
        topk = _dedup_topk((_normalize_path_key(path) for path in paths_raw), k=k)
        rank = _first_hit_rank(topk, relevant)
        recall = 0.0
        if relevant:
            recall = len(set(topk) & relevant) / float(len(relevant))
        return StrategyRun(
            strategy_key=strategy_spec.strategy_key,
            strategy=strategy_spec.strategy,
            stage2_mode=strategy_spec.stage2_mode,
            effective_method=effective_method,
            execution_method=execution_method,
            latency_ms=latency_ms,
            topk_paths=topk,
            first_hit_rank=rank,
            hit_at_k=rank is not None,
            recall_at_k=recall,
            generated_artifact_count=sum(1 for path in topk if is_generated_artifact_path(path)),
            test_file_count=sum(1 for path in topk if is_test_file(path)),
            error=None,
        )
    except Exception as exc:
        # Record the failure as a zero-result run so the benchmark keeps going.
        latency_ms = _now_ms() - start_ms
        return StrategyRun(
            strategy_key=strategy_spec.strategy_key,
            strategy=strategy_spec.strategy,
            stage2_mode=strategy_spec.stage2_mode,
            effective_method=effective_method,
            execution_method=execution_method,
            latency_ms=latency_ms,
            topk_paths=[],
            first_hit_rank=None,
            hit_at_k=False,
            recall_at_k=0.0,
            generated_artifact_count=0,
            test_file_count=0,
            error=f"{type(exc).__name__}: {exc}",
        )
    finally:
        # Restore the config fields mutated above.
        config.cascade_strategy = previous_cascade_strategy
        config.staged_stage2_mode = previous_stage2_mode
|
||||
|
||||
|
||||
def _summarize_runs(runs: Sequence[StrategyRun]) -> Dict[str, Any]:
    """Aggregate per-query runs into strategy-level quality and latency metrics.

    Latency stats only count successful runs; accuracy metrics count every
    run, so a failed run scores as a miss.
    """
    ok_latencies = [run.latency_ms for run in runs if not run.error]
    method_counts: Dict[str, int] = {}
    for run in runs:
        method_counts[run.effective_method] = 1 + method_counts.get(run.effective_method, 0)
    return {
        "query_count": len(runs),
        "hit_at_k": _mean([1.0 if run.hit_at_k else 0.0 for run in runs]),
        "mrr_at_k": _mrr([run.first_hit_rank for run in runs]),
        "avg_recall_at_k": _mean([run.recall_at_k for run in runs]),
        "avg_latency_ms": _mean(ok_latencies),
        "p50_latency_ms": _percentile(ok_latencies, 0.50),
        "p95_latency_ms": _percentile(ok_latencies, 0.95),
        "avg_generated_artifact_count": _mean([float(run.generated_artifact_count) for run in runs]),
        "avg_test_file_count": _mean([float(run.test_file_count) for run in runs]),
        "runs_with_generated_artifacts": sum(1 for run in runs if run.generated_artifact_count > 0),
        "runs_with_test_files": sum(1 for run in runs if run.test_file_count > 0),
        "effective_methods": method_counts,
        "errors": sum(1 for run in runs if run.error),
    }
|
||||
|
||||
|
||||
def _build_pairwise_deltas(stage2_summaries: Dict[str, Dict[str, Any]]) -> List[PairwiseDelta]:
    """Compute metric deltas for every unordered pair of summarized modes (a minus b)."""
    modes = list(stage2_summaries)
    deltas: List[PairwiseDelta] = []
    for position, mode_a in enumerate(modes):
        summary_a = stage2_summaries[mode_a]
        for mode_b in modes[position + 1:]:
            summary_b = stage2_summaries[mode_b]
            deltas.append(
                PairwiseDelta(
                    mode_a=mode_a,
                    mode_b=mode_b,
                    hit_at_k_delta=summary_a["hit_at_k"] - summary_b["hit_at_k"],
                    mrr_at_k_delta=summary_a["mrr_at_k"] - summary_b["mrr_at_k"],
                    avg_recall_at_k_delta=summary_a["avg_recall_at_k"] - summary_b["avg_recall_at_k"],
                    avg_latency_ms_delta=summary_a["avg_latency_ms"] - summary_b["avg_latency_ms"],
                )
            )
    return deltas
|
||||
|
||||
|
||||
def _make_plan_payload(
|
||||
*,
|
||||
args: argparse.Namespace,
|
||||
source_root: Path,
|
||||
dataset: Sequence[Dict[str, Any]],
|
||||
baseline_methods: Sequence[str],
|
||||
stage2_modes: Sequence[str],
|
||||
strategy_specs: Sequence[StrategySpec],
|
||||
) -> Dict[str, Any]:
|
||||
return {
|
||||
"mode": "dry-run" if args.dry_run else "self-check",
|
||||
"local_only": True,
|
||||
"source": str(source_root),
|
||||
"queries_file": str(args.queries_file),
|
||||
"query_count": len(dataset),
|
||||
"query_match": args.query_match,
|
||||
"k": int(args.k),
|
||||
"coarse_k": int(args.coarse_k),
|
||||
"baseline_methods": list(baseline_methods),
|
||||
"stage2_modes": list(stage2_modes),
|
||||
"strategy_keys": [spec.strategy_key for spec in strategy_specs],
|
||||
"local_backends": {
|
||||
"embedding_backend": args.embedding_backend,
|
||||
"embedding_model": args.embedding_model,
|
||||
"reranker_backend": args.reranker_backend,
|
||||
"reranker_model": args.reranker_model,
|
||||
"embedding_use_gpu": bool(args.embedding_use_gpu),
|
||||
"reranker_use_gpu": bool(args.reranker_use_gpu),
|
||||
},
|
||||
"output": str(args.output),
|
||||
"progress_output": str(args.progress_output) if args.progress_output else None,
|
||||
"dataset_preview": [
|
||||
{
|
||||
"query": item.get("query"),
|
||||
"intent": item.get("intent"),
|
||||
"relevant_paths": item.get("relevant_paths"),
|
||||
}
|
||||
for item in list(dataset)[: min(3, len(dataset))]
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
def build_parser() -> argparse.ArgumentParser:
    """Build the CLI parser for the CCW smart_search stage2 benchmark."""
    parser = argparse.ArgumentParser(description=__doc__)
    # --- Dataset / source selection ---
    parser.add_argument(
        "--source",
        type=Path,
        default=DEFAULT_SOURCE,
        help="Source root to benchmark. Defaults to the repository root so CCW and CodexLens paths resolve together.",
    )
    parser.add_argument(
        "--queries-file",
        type=Path,
        default=DEFAULT_QUERIES_FILE,
        help="Labeled JSONL dataset of CCW smart_search queries",
    )
    parser.add_argument("--query-limit", type=int, default=None, help="Optional query limit")
    parser.add_argument(
        "--query-match",
        type=str,
        default=None,
        help="Optional case-insensitive substring filter for selecting specific benchmark queries.",
    )
    # --- Evaluation depth ---
    parser.add_argument("--k", type=int, default=10, help="Top-k to evaluate")
    parser.add_argument("--coarse-k", type=int, default=100, help="Stage-1 coarse_k")
    # --- Strategy matrix ---
    parser.add_argument(
        "--baseline-methods",
        nargs="*",
        default=list(VALID_BASELINE_METHODS),
        help="Requested smart_search baselines to compare before staged modes (valid: auto, fts, hybrid).",
    )
    parser.add_argument(
        "--stage2-modes",
        nargs="*",
        default=list(VALID_STAGE2_MODES),
        help="Stage-2 modes to compare",
    )
    parser.add_argument("--warmup", type=int, default=0, help="Warmup iterations per strategy")
    # --- Local backend configuration (GPU off by default for stability) ---
    parser.add_argument(
        "--embedding-backend",
        default="fastembed",
        help="Local embedding backend. This runner only accepts fastembed.",
    )
    parser.add_argument(
        "--embedding-model",
        default="code",
        help="Embedding model/profile for the local embedding backend",
    )
    parser.add_argument(
        "--embedding-use-gpu",
        action="store_true",
        help="Enable GPU acceleration for local embeddings. Off by default for stability.",
    )
    parser.add_argument(
        "--reranker-backend",
        default="onnx",
        help="Local reranker backend. Supported local values: onnx, fastembed, legacy.",
    )
    parser.add_argument(
        "--reranker-model",
        default=DEFAULT_LOCAL_ONNX_RERANKER_MODEL,
        help="Reranker model name for the local reranker backend",
    )
    parser.add_argument(
        "--reranker-use-gpu",
        action="store_true",
        help="Enable GPU acceleration for the local reranker. Off by default for stability.",
    )
    # --- Run-mode toggles ---
    parser.add_argument(
        "--skip-dense-baseline",
        action="store_true",
        help="Only compare staged stage2 modes and skip the dense_rerank baseline.",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Validate dataset/config and print the benchmark plan without running retrieval.",
    )
    parser.add_argument(
        "--self-check",
        action="store_true",
        help="Smoke-check the entrypoint by validating dataset, source paths, and stage matrix wiring.",
    )
    # --- Outputs ---
    parser.add_argument(
        "--output",
        type=Path,
        default=DEFAULT_OUTPUT,
        help="Output JSON path",
    )
    parser.add_argument(
        "--progress-output",
        type=Path,
        default=None,
        help="Optional JSON path updated after each query with partial progress and completed runs.",
    )
    return parser
|
||||
|
||||
|
||||
def main() -> None:
|
||||
parser = build_parser()
|
||||
args = parser.parse_args()
|
||||
|
||||
source_root = args.source.expanduser().resolve()
|
||||
if not source_root.exists():
|
||||
raise SystemExit(f"Source path does not exist: {source_root}")
|
||||
if int(args.k) <= 0:
|
||||
raise SystemExit("--k must be > 0")
|
||||
if int(args.coarse_k) <= 0:
|
||||
raise SystemExit("--coarse-k must be > 0")
|
||||
if int(args.coarse_k) < int(args.k):
|
||||
raise SystemExit("--coarse-k must be >= --k")
|
||||
if int(args.warmup) < 0:
|
||||
raise SystemExit("--warmup must be >= 0")
|
||||
|
||||
embedding_backend = str(args.embedding_backend).strip().lower()
|
||||
reranker_backend = str(args.reranker_backend).strip().lower()
|
||||
_validate_local_only_backends(embedding_backend, reranker_backend)
|
||||
baseline_methods = _validate_baseline_methods(args.baseline_methods)
|
||||
stage2_modes = _validate_stage2_modes(args.stage2_modes)
|
||||
|
||||
dataset = _load_labeled_queries(args.queries_file, None)
|
||||
dataset = _filter_dataset_by_query_match(dataset, args.query_match)
|
||||
dataset = _apply_query_limit(dataset, args.query_limit)
|
||||
if not dataset:
|
||||
raise SystemExit("No queries to run")
|
||||
|
||||
missing_paths: List[str] = []
|
||||
for item in dataset:
|
||||
_, _, item_missing = _resolve_expected_paths(source_root, [str(path) for path in item["relevant_paths"]])
|
||||
missing_paths.extend(item_missing)
|
||||
if missing_paths:
|
||||
preview = ", ".join(missing_paths[:3])
|
||||
raise SystemExit(
|
||||
"Dataset relevant_paths do not resolve under the selected source root. "
|
||||
f"Examples: {preview}"
|
||||
)
|
||||
|
||||
strategy_specs = _strategy_specs(
|
||||
stage2_modes,
|
||||
include_dense_baseline=not args.skip_dense_baseline,
|
||||
baseline_methods=baseline_methods,
|
||||
)
|
||||
|
||||
if args.dry_run or args.self_check:
|
||||
payload = _make_plan_payload(
|
||||
args=args,
|
||||
source_root=source_root,
|
||||
dataset=dataset,
|
||||
baseline_methods=baseline_methods,
|
||||
stage2_modes=stage2_modes,
|
||||
strategy_specs=strategy_specs,
|
||||
)
|
||||
if args.self_check:
|
||||
payload["status"] = "ok"
|
||||
payload["checks"] = {
|
||||
"dataset_loaded": True,
|
||||
"stage2_matrix_size": len(stage2_modes),
|
||||
"local_only_validation": True,
|
||||
"source_path_exists": True,
|
||||
}
|
||||
print(json.dumps(payload, ensure_ascii=False, indent=2))
|
||||
return
|
||||
|
||||
config = Config.load()
|
||||
config.cascade_strategy = "staged"
|
||||
config.enable_staged_rerank = True
|
||||
config.enable_cross_encoder_rerank = True
|
||||
config.embedding_backend = embedding_backend
|
||||
config.embedding_model = str(args.embedding_model).strip()
|
||||
config.embedding_use_gpu = bool(args.embedding_use_gpu)
|
||||
config.embedding_auto_embed_missing = False
|
||||
config.reranker_backend = reranker_backend
|
||||
config.reranker_model = str(args.reranker_model).strip()
|
||||
config.reranker_use_gpu = bool(args.reranker_use_gpu)
|
||||
|
||||
strategy_runtimes = {
|
||||
spec.strategy_key: _build_strategy_runtime(config, spec)
|
||||
for spec in strategy_specs
|
||||
}
|
||||
|
||||
evaluations: List[QueryEvaluation] = []
|
||||
total_queries = len(dataset)
|
||||
total_runs = total_queries * len(strategy_specs)
|
||||
completed_runs = 0
|
||||
|
||||
try:
|
||||
if int(args.warmup) > 0:
|
||||
warm_query = str(dataset[0]["query"]).strip()
|
||||
warm_relevant_paths = [str(path) for path in dataset[0]["relevant_paths"]]
|
||||
_, warm_relevant, _ = _resolve_expected_paths(source_root, warm_relevant_paths)
|
||||
for spec in strategy_specs:
|
||||
runtime = strategy_runtimes[spec.strategy_key]
|
||||
for _ in range(int(args.warmup)):
|
||||
_run_strategy(
|
||||
runtime.engine,
|
||||
runtime.config,
|
||||
strategy_spec=spec,
|
||||
query=warm_query,
|
||||
source_path=source_root,
|
||||
k=min(int(args.k), 5),
|
||||
coarse_k=min(int(args.coarse_k), 50),
|
||||
relevant=warm_relevant,
|
||||
)
|
||||
|
||||
for index, item in enumerate(dataset, start=1):
|
||||
query = str(item.get("query", "")).strip()
|
||||
if not query:
|
||||
continue
|
||||
print(f"[query {index}/{total_queries}] {query}", flush=True)
|
||||
relevant_paths, relevant, _ = _resolve_expected_paths(
|
||||
source_root,
|
||||
[str(path) for path in item["relevant_paths"]],
|
||||
)
|
||||
runs: Dict[str, StrategyRun] = {}
|
||||
for spec in strategy_specs:
|
||||
if args.progress_output is not None:
|
||||
_write_json_payload(
|
||||
args.progress_output,
|
||||
_make_progress_payload(
|
||||
args=args,
|
||||
source_root=source_root,
|
||||
strategy_specs=strategy_specs,
|
||||
evaluations=evaluations,
|
||||
query_index=index - 1,
|
||||
total_queries=total_queries,
|
||||
run_index=completed_runs,
|
||||
total_runs=total_runs,
|
||||
current_query=query,
|
||||
current_strategy_key=spec.strategy_key,
|
||||
),
|
||||
)
|
||||
print(
|
||||
f"[run {completed_runs + 1}/{total_runs}] "
|
||||
f"strategy={spec.strategy_key} query={query}",
|
||||
flush=True,
|
||||
)
|
||||
runtime = strategy_runtimes[spec.strategy_key]
|
||||
runs[spec.strategy_key] = _run_strategy(
|
||||
runtime.engine,
|
||||
runtime.config,
|
||||
strategy_spec=spec,
|
||||
query=query,
|
||||
source_path=source_root,
|
||||
k=int(args.k),
|
||||
coarse_k=int(args.coarse_k),
|
||||
relevant=relevant,
|
||||
)
|
||||
completed_runs += 1
|
||||
run = runs[spec.strategy_key]
|
||||
outcome = "error" if run.error else "ok"
|
||||
print(
|
||||
f"[done {completed_runs}/{total_runs}] "
|
||||
f"strategy={spec.strategy_key} outcome={outcome} "
|
||||
f"latency_ms={run.latency_ms:.2f} "
|
||||
f"first_hit_rank={run.first_hit_rank}",
|
||||
flush=True,
|
||||
)
|
||||
evaluations.append(
|
||||
QueryEvaluation(
|
||||
query=query,
|
||||
intent=str(item.get("intent")) if item.get("intent") is not None else None,
|
||||
notes=str(item.get("notes")) if item.get("notes") is not None else None,
|
||||
relevant_paths=relevant_paths,
|
||||
runs=runs,
|
||||
)
|
||||
)
|
||||
if args.progress_output is not None:
|
||||
_write_json_payload(
|
||||
args.progress_output,
|
||||
_make_progress_payload(
|
||||
args=args,
|
||||
source_root=source_root,
|
||||
strategy_specs=strategy_specs,
|
||||
evaluations=evaluations,
|
||||
query_index=index,
|
||||
total_queries=total_queries,
|
||||
run_index=completed_runs,
|
||||
total_runs=total_runs,
|
||||
current_query=query,
|
||||
current_strategy_key="complete",
|
||||
),
|
||||
)
|
||||
finally:
|
||||
for runtime in strategy_runtimes.values():
|
||||
try:
|
||||
runtime.engine.close()
|
||||
except Exception:
|
||||
pass
|
||||
for runtime in strategy_runtimes.values():
|
||||
try:
|
||||
runtime.registry.close()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
strategy_summaries: Dict[str, Dict[str, Any]] = {}
|
||||
for spec in strategy_specs:
|
||||
spec_runs = [evaluation.runs[spec.strategy_key] for evaluation in evaluations if spec.strategy_key in evaluation.runs]
|
||||
summary = _summarize_runs(spec_runs)
|
||||
summary["strategy"] = spec.strategy
|
||||
summary["stage2_mode"] = spec.stage2_mode
|
||||
strategy_summaries[spec.strategy_key] = summary
|
||||
|
||||
stage2_mode_matrix = {
|
||||
mode: strategy_summaries[f"staged:{mode}"]
|
||||
for mode in stage2_modes
|
||||
if f"staged:{mode}" in strategy_summaries
|
||||
}
|
||||
pairwise_deltas = [asdict(item) for item in _build_pairwise_deltas(stage2_mode_matrix)]
|
||||
|
||||
payload = {
|
||||
"status": "completed",
|
||||
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"source": str(source_root),
|
||||
"queries_file": str(args.queries_file),
|
||||
"query_count": len(evaluations),
|
||||
"query_match": args.query_match,
|
||||
"k": int(args.k),
|
||||
"coarse_k": int(args.coarse_k),
|
||||
"local_only": True,
|
||||
"strategies": strategy_summaries,
|
||||
"stage2_mode_matrix": stage2_mode_matrix,
|
||||
"pairwise_stage2_deltas": pairwise_deltas,
|
||||
"config": {
|
||||
"embedding_backend": config.embedding_backend,
|
||||
"embedding_model": config.embedding_model,
|
||||
"embedding_use_gpu": bool(config.embedding_use_gpu),
|
||||
"reranker_backend": config.reranker_backend,
|
||||
"reranker_model": config.reranker_model,
|
||||
"reranker_use_gpu": bool(config.reranker_use_gpu),
|
||||
"enable_staged_rerank": bool(config.enable_staged_rerank),
|
||||
"enable_cross_encoder_rerank": bool(config.enable_cross_encoder_rerank),
|
||||
},
|
||||
"progress_output": str(args.progress_output) if args.progress_output else None,
|
||||
"evaluations": [
|
||||
{
|
||||
"query": evaluation.query,
|
||||
"intent": evaluation.intent,
|
||||
"notes": evaluation.notes,
|
||||
"relevant_paths": evaluation.relevant_paths,
|
||||
"runs": {key: asdict(run) for key, run in evaluation.runs.items()},
|
||||
}
|
||||
for evaluation in evaluations
|
||||
],
|
||||
}
|
||||
|
||||
_write_final_outputs(
|
||||
output_path=args.output,
|
||||
progress_output=args.progress_output,
|
||||
payload=payload,
|
||||
)
|
||||
print(json.dumps(payload, ensure_ascii=False, indent=2))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -280,8 +280,9 @@ def main() -> None:
|
||||
if args.staged_cluster_strategy:
|
||||
config.staged_clustering_strategy = str(args.staged_cluster_strategy)
|
||||
# Stability: on some Windows setups, fastembed + DirectML can crash under load.
|
||||
# Dense_rerank uses the embedding backend that matches the index; force CPU here.
|
||||
# Force local embeddings and reranking onto CPU for reproducible benchmark runs.
|
||||
config.embedding_use_gpu = False
|
||||
config.reranker_use_gpu = False
|
||||
registry = RegistryStore()
|
||||
registry.initialize()
|
||||
mapper = PathMapper()
|
||||
|
||||
1704
codex-lens/benchmarks/results/ccw_smart_search_stage2.json
Normal file
1704
codex-lens/benchmarks/results/ccw_smart_search_stage2.json
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,526 @@
|
||||
{
|
||||
"timestamp": "2026-03-14 23:16:55",
|
||||
"source": "D:\\Claude_dms3",
|
||||
"queries_file": "D:\\Claude_dms3\\codex-lens\\benchmarks\\accuracy_queries_ccw_smart_search.jsonl",
|
||||
"query_count": 4,
|
||||
"k": 10,
|
||||
"coarse_k": 100,
|
||||
"local_only": true,
|
||||
"strategies": {
|
||||
"dense_rerank": {
|
||||
"query_count": 4,
|
||||
"hit_at_k": 0.0,
|
||||
"mrr_at_k": 0.0,
|
||||
"avg_recall_at_k": 0.0,
|
||||
"avg_latency_ms": 20171.940174996853,
|
||||
"p50_latency_ms": 14222.247749984264,
|
||||
"p95_latency_ms": 35222.31535999476,
|
||||
"errors": 0,
|
||||
"strategy": "dense_rerank",
|
||||
"stage2_mode": null
|
||||
},
|
||||
"staged:precomputed": {
|
||||
"query_count": 4,
|
||||
"hit_at_k": 0.0,
|
||||
"mrr_at_k": 0.0,
|
||||
"avg_recall_at_k": 0.0,
|
||||
"avg_latency_ms": 13679.793299987912,
|
||||
"p50_latency_ms": 12918.63379997015,
|
||||
"p95_latency_ms": 16434.964765003322,
|
||||
"errors": 0,
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "precomputed"
|
||||
},
|
||||
"staged:realtime": {
|
||||
"query_count": 4,
|
||||
"hit_at_k": 0.0,
|
||||
"mrr_at_k": 0.0,
|
||||
"avg_recall_at_k": 0.0,
|
||||
"avg_latency_ms": 13885.101849973202,
|
||||
"p50_latency_ms": 13826.323699980974,
|
||||
"p95_latency_ms": 14867.712269958853,
|
||||
"errors": 0,
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "realtime"
|
||||
},
|
||||
"staged:static_global_graph": {
|
||||
"query_count": 4,
|
||||
"hit_at_k": 0.0,
|
||||
"mrr_at_k": 0.0,
|
||||
"avg_recall_at_k": 0.0,
|
||||
"avg_latency_ms": 13336.124025002122,
|
||||
"p50_latency_ms": 13415.476950019598,
|
||||
"p95_latency_ms": 13514.329230004549,
|
||||
"errors": 0,
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "static_global_graph"
|
||||
}
|
||||
},
|
||||
"stage2_mode_matrix": {
|
||||
"precomputed": {
|
||||
"query_count": 4,
|
||||
"hit_at_k": 0.0,
|
||||
"mrr_at_k": 0.0,
|
||||
"avg_recall_at_k": 0.0,
|
||||
"avg_latency_ms": 13679.793299987912,
|
||||
"p50_latency_ms": 12918.63379997015,
|
||||
"p95_latency_ms": 16434.964765003322,
|
||||
"errors": 0,
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "precomputed"
|
||||
},
|
||||
"realtime": {
|
||||
"query_count": 4,
|
||||
"hit_at_k": 0.0,
|
||||
"mrr_at_k": 0.0,
|
||||
"avg_recall_at_k": 0.0,
|
||||
"avg_latency_ms": 13885.101849973202,
|
||||
"p50_latency_ms": 13826.323699980974,
|
||||
"p95_latency_ms": 14867.712269958853,
|
||||
"errors": 0,
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "realtime"
|
||||
},
|
||||
"static_global_graph": {
|
||||
"query_count": 4,
|
||||
"hit_at_k": 0.0,
|
||||
"mrr_at_k": 0.0,
|
||||
"avg_recall_at_k": 0.0,
|
||||
"avg_latency_ms": 13336.124025002122,
|
||||
"p50_latency_ms": 13415.476950019598,
|
||||
"p95_latency_ms": 13514.329230004549,
|
||||
"errors": 0,
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "static_global_graph"
|
||||
}
|
||||
},
|
||||
"pairwise_stage2_deltas": [
|
||||
{
|
||||
"mode_a": "precomputed",
|
||||
"mode_b": "realtime",
|
||||
"hit_at_k_delta": 0.0,
|
||||
"mrr_at_k_delta": 0.0,
|
||||
"avg_recall_at_k_delta": 0.0,
|
||||
"avg_latency_ms_delta": -205.30854998528957
|
||||
},
|
||||
{
|
||||
"mode_a": "precomputed",
|
||||
"mode_b": "static_global_graph",
|
||||
"hit_at_k_delta": 0.0,
|
||||
"mrr_at_k_delta": 0.0,
|
||||
"avg_recall_at_k_delta": 0.0,
|
||||
"avg_latency_ms_delta": 343.66927498579025
|
||||
},
|
||||
{
|
||||
"mode_a": "realtime",
|
||||
"mode_b": "static_global_graph",
|
||||
"hit_at_k_delta": 0.0,
|
||||
"mrr_at_k_delta": 0.0,
|
||||
"avg_recall_at_k_delta": 0.0,
|
||||
"avg_latency_ms_delta": 548.9778249710798
|
||||
}
|
||||
],
|
||||
"config": {
|
||||
"embedding_backend": "fastembed",
|
||||
"embedding_model": "code",
|
||||
"embedding_use_gpu": false,
|
||||
"reranker_backend": "onnx",
|
||||
"reranker_model": "cross-encoder/ms-marco-MiniLM-L-6-v2",
|
||||
"enable_staged_rerank": true,
|
||||
"enable_cross_encoder_rerank": true
|
||||
},
|
||||
"evaluations": [
|
||||
{
|
||||
"query": "executeHybridMode dense_rerank semantic smart_search",
|
||||
"intent": "ccw-semantic-routing",
|
||||
"notes": "CCW semantic mode delegates to CodexLens dense_rerank.",
|
||||
"relevant_paths": [
|
||||
"D:\\Claude_dms3\\ccw\\src\\tools\\smart-search.ts"
|
||||
],
|
||||
"runs": {
|
||||
"dense_rerank": {
|
||||
"strategy_key": "dense_rerank",
|
||||
"strategy": "dense_rerank",
|
||||
"stage2_mode": null,
|
||||
"latency_ms": 38829.27079999447,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\routes\\issue-routes.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\tools\\session-manager.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\types\\queue-types.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\nativesessionpanel.tsx",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\history-importer.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\memory-extraction-pipeline.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\tests\\e2e\\skills-page.spec.ts",
|
||||
"d:\\claude_dms3\\ccw\\dist\\tools\\discover-design-files.js",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\api-settings\\clisettingsmodal.tsx",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\tests\\e2e\\api-settings.spec.ts"
|
||||
],
|
||||
"first_hit_rank": null,
|
||||
"hit_at_k": false,
|
||||
"recall_at_k": 0.0,
|
||||
"error": null
|
||||
},
|
||||
"staged:precomputed": {
|
||||
"strategy_key": "staged:precomputed",
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "precomputed",
|
||||
"latency_ms": 16915.833400011063,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\ccw\\src\\tools\\native-session-discovery.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\commands\\memory.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\useissues.test.tsx",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\routes\\cli-sessions-routes.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\lib\\api.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\filepreview.tsx",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\hooks\\hook-templates.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\utils\\file-reader.ts",
|
||||
"d:\\claude_dms3\\ccw\\dist\\core\\routes\\cli-sessions-routes.js",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\history-importer.ts"
|
||||
],
|
||||
"first_hit_rank": null,
|
||||
"hit_at_k": false,
|
||||
"recall_at_k": 0.0,
|
||||
"error": null
|
||||
},
|
||||
"staged:realtime": {
|
||||
"strategy_key": "staged:realtime",
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "realtime",
|
||||
"latency_ms": 13961.2567999959,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\ccw\\src\\tools\\native-session-discovery.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\commands\\memory.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\useissues.test.tsx",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\routes\\cli-sessions-routes.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\lib\\api.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\filepreview.tsx",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\hooks\\hook-templates.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\utils\\file-reader.ts",
|
||||
"d:\\claude_dms3\\ccw\\dist\\core\\routes\\cli-sessions-routes.js",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\history-importer.ts"
|
||||
],
|
||||
"first_hit_rank": null,
|
||||
"hit_at_k": false,
|
||||
"recall_at_k": 0.0,
|
||||
"error": null
|
||||
},
|
||||
"staged:static_global_graph": {
|
||||
"strategy_key": "staged:static_global_graph",
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "static_global_graph",
|
||||
"latency_ms": 12986.330999970436,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\ccw\\src\\tools\\native-session-discovery.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\commands\\memory.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\useissues.test.tsx",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\routes\\cli-sessions-routes.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\lib\\api.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\filepreview.tsx",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\hooks\\hook-templates.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\utils\\file-reader.ts",
|
||||
"d:\\claude_dms3\\ccw\\dist\\core\\routes\\cli-sessions-routes.js",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\history-importer.ts"
|
||||
],
|
||||
"first_hit_rank": null,
|
||||
"hit_at_k": false,
|
||||
"recall_at_k": 0.0,
|
||||
"error": null
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"query": "parse CodexLens JSON output strip ANSI smart_search",
|
||||
"intent": "ccw-json-fallback",
|
||||
"notes": "Covers JSON/plain-text fallback handling for CodexLens output.",
|
||||
"relevant_paths": [
|
||||
"D:\\Claude_dms3\\ccw\\src\\tools\\smart-search.ts"
|
||||
],
|
||||
"runs": {
|
||||
"dense_rerank": {
|
||||
"strategy_key": "dense_rerank",
|
||||
"strategy": "dense_rerank",
|
||||
"stage2_mode": null,
|
||||
"latency_ms": 14782.901199996471,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\ccw\\src\\tools\\codex-lens-lsp.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\issue\\queue\\queueexecuteinsession.tsx",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\terminal-dashboard\\queuepanel.tsx",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\usewebsocket.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\useflows.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\tests\\e2e\\api-error-monitoring.spec.ts",
|
||||
"d:\\claude_dms3\\ccw\\tests\\native-session-discovery.test.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\services\\checkpoint-service.ts",
|
||||
"d:\\claude_dms3\\ccw\\tests\\integration\\system-routes.test.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\tools\\native-session-discovery.ts"
|
||||
],
|
||||
"first_hit_rank": null,
|
||||
"hit_at_k": false,
|
||||
"recall_at_k": 0.0,
|
||||
"error": null
|
||||
},
|
||||
"staged:precomputed": {
|
||||
"strategy_key": "staged:precomputed",
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "precomputed",
|
||||
"latency_ms": 13710.042499959469,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\userealtimeupdates.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\routes\\cli-routes.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\stores\\queueexecutionstore.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\tools\\native-session-discovery.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\lib\\themeshare.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\clistreampanel.tsx",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\terminal-panel\\queueexecutionlistview.tsx",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\tests\\e2e\\api-settings.spec.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\test\\i18n.tsx",
|
||||
"d:\\claude_dms3\\ccw\\dist\\core\\history-importer.js"
|
||||
],
|
||||
"first_hit_rank": null,
|
||||
"hit_at_k": false,
|
||||
"recall_at_k": 0.0,
|
||||
"error": null
|
||||
},
|
||||
"staged:realtime": {
|
||||
"strategy_key": "staged:realtime",
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "realtime",
|
||||
"latency_ms": 15027.674999952316,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\userealtimeupdates.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\routes\\cli-routes.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\stores\\queueexecutionstore.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\tools\\native-session-discovery.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\lib\\themeshare.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\clistreampanel.tsx",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\terminal-panel\\queueexecutionlistview.tsx",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\tests\\e2e\\api-settings.spec.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\test\\i18n.tsx",
|
||||
"d:\\claude_dms3\\ccw\\dist\\core\\history-importer.js"
|
||||
],
|
||||
"first_hit_rank": null,
|
||||
"hit_at_k": false,
|
||||
"recall_at_k": 0.0,
|
||||
"error": null
|
||||
},
|
||||
"staged:static_global_graph": {
|
||||
"strategy_key": "staged:static_global_graph",
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "static_global_graph",
|
||||
"latency_ms": 13389.622500002384,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\userealtimeupdates.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\routes\\cli-routes.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\stores\\queueexecutionstore.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\tools\\native-session-discovery.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\lib\\themeshare.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\clistreampanel.tsx",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\terminal-panel\\queueexecutionlistview.tsx",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\tests\\e2e\\api-settings.spec.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\test\\i18n.tsx",
|
||||
"d:\\claude_dms3\\ccw\\dist\\core\\history-importer.js"
|
||||
],
|
||||
"first_hit_rank": null,
|
||||
"hit_at_k": false,
|
||||
"recall_at_k": 0.0,
|
||||
"error": null
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"query": "smart_search init embed search action schema",
|
||||
"intent": "ccw-action-schema",
|
||||
"notes": "Find the Zod schema that defines init/embed/search actions.",
|
||||
"relevant_paths": [
|
||||
"D:\\Claude_dms3\\ccw\\src\\tools\\smart-search.ts"
|
||||
],
|
||||
"runs": {
|
||||
"dense_rerank": {
|
||||
"strategy_key": "dense_rerank",
|
||||
"strategy": "dense_rerank",
|
||||
"stage2_mode": null,
|
||||
"latency_ms": 13661.594299972057,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\ccw\\src\\tools\\ask-question.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\a2ui\\a2uipopupcard.tsx",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\routes\\discovery-routes.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\a2ui\\a2uiwebsockethandler.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\useissues.test.tsx",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\tests\\e2e\\discovery.spec.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\tools\\__tests__\\ask-question.test.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\tests\\e2e\\api-settings.spec.ts",
|
||||
"d:\\claude_dms3\\ccw\\dist\\core\\a2ui\\a2uiwebsockethandler.js",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\tests\\e2e\\dashboard.spec.ts"
|
||||
],
|
||||
"first_hit_rank": null,
|
||||
"hit_at_k": false,
|
||||
"recall_at_k": 0.0,
|
||||
"error": null
|
||||
},
|
||||
"staged:precomputed": {
|
||||
"strategy_key": "staged:precomputed",
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "precomputed",
|
||||
"latency_ms": 12127.225099980831,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\lib\\api.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\lite-scanner-complete.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\themeselector.tsx",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\team\\teamheader.tsx",
|
||||
"d:\\claude_dms3\\ccw\\src\\tools\\ask-question.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\a2ui\\a2uipopupcard.tsx",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\issue\\discovery\\findinglist.tsx",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\api-settings\\clisettingsmodal.tsx",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\routes\\discovery-routes.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\a2ui\\a2uiwebsockethandler.ts"
|
||||
],
|
||||
"first_hit_rank": null,
|
||||
"hit_at_k": false,
|
||||
"recall_at_k": 0.0,
|
||||
"error": null
|
||||
},
|
||||
"staged:realtime": {
|
||||
"strategy_key": "staged:realtime",
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "realtime",
|
||||
"latency_ms": 12860.084999978542,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\lib\\api.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\lite-scanner-complete.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\themeselector.tsx",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\team\\teamheader.tsx",
|
||||
"d:\\claude_dms3\\ccw\\src\\tools\\ask-question.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\a2ui\\a2uipopupcard.tsx",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\issue\\discovery\\findinglist.tsx",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\api-settings\\clisettingsmodal.tsx",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\routes\\discovery-routes.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\a2ui\\a2uiwebsockethandler.ts"
|
||||
],
|
||||
"first_hit_rank": null,
|
||||
"hit_at_k": false,
|
||||
"recall_at_k": 0.0,
|
||||
"error": null
|
||||
},
|
||||
"staged:static_global_graph": {
|
||||
"strategy_key": "staged:static_global_graph",
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "static_global_graph",
|
||||
"latency_ms": 13441.331400036812,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\lib\\api.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\lite-scanner-complete.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\themeselector.tsx",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\team\\teamheader.tsx",
|
||||
"d:\\claude_dms3\\ccw\\src\\tools\\ask-question.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\a2ui\\a2uipopupcard.tsx",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\issue\\discovery\\findinglist.tsx",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\api-settings\\clisettingsmodal.tsx",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\routes\\discovery-routes.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\a2ui\\a2uiwebsockethandler.ts"
|
||||
],
|
||||
"first_hit_rank": null,
|
||||
"hit_at_k": false,
|
||||
"recall_at_k": 0.0,
|
||||
"error": null
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"query": "auto init missing job dedupe smart_search",
|
||||
"intent": "ccw-auto-init",
|
||||
"notes": "Targets background init/embed warmup and dedupe state.",
|
||||
"relevant_paths": [
|
||||
"D:\\Claude_dms3\\ccw\\src\\tools\\smart-search.ts"
|
||||
],
|
||||
"runs": {
|
||||
"dense_rerank": {
|
||||
"strategy_key": "dense_rerank",
|
||||
"strategy": "dense_rerank",
|
||||
"stage2_mode": null,
|
||||
"latency_ms": 13413.994400024414,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\pages\\memorypage.tsx",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\routes\\memory-routes.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\lib\\api.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\usememory.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\batchoperationtoolbar.tsx",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\tests\\e2e\\memory.spec.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\useprompthistory.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\stores\\flowstore.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\services\\deepwiki-service.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\routes\\claude-routes.ts"
|
||||
],
|
||||
"first_hit_rank": null,
|
||||
"hit_at_k": false,
|
||||
"recall_at_k": 0.0,
|
||||
"error": null
|
||||
},
|
||||
"staged:precomputed": {
|
||||
"strategy_key": "staged:precomputed",
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "precomputed",
|
||||
"latency_ms": 11966.072200000286,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\ccw\\src\\commands\\memory.ts",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\handlers.py",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\ui\\commandcombobox.tsx",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\global_graph_expander.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\definition.py",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\orchestrator\\orchestrationplanbuilder.ts",
|
||||
"d:\\claude_dms3\\codex-lens\\build\\lib\\codexlens\\lsp\\handlers.py",
|
||||
"d:\\claude_dms3\\codex-lens\\build\\lib\\codexlens\\search\\global_graph_expander.py",
|
||||
"d:\\claude_dms3\\codex-lens\\build\\lib\\codexlens\\api\\definition.py",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\pages\\memorypage.tsx"
|
||||
],
|
||||
"first_hit_rank": null,
|
||||
"hit_at_k": false,
|
||||
"recall_at_k": 0.0,
|
||||
"error": null
|
||||
},
|
||||
"staged:realtime": {
|
||||
"strategy_key": "staged:realtime",
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "realtime",
|
||||
"latency_ms": 13691.39059996605,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\ccw\\src\\commands\\memory.ts",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\handlers.py",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\ui\\commandcombobox.tsx",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\global_graph_expander.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\definition.py",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\orchestrator\\orchestrationplanbuilder.ts",
|
||||
"d:\\claude_dms3\\codex-lens\\build\\lib\\codexlens\\lsp\\handlers.py",
|
||||
"d:\\claude_dms3\\codex-lens\\build\\lib\\codexlens\\search\\global_graph_expander.py",
|
||||
"d:\\claude_dms3\\codex-lens\\build\\lib\\codexlens\\api\\definition.py",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\pages\\memorypage.tsx"
|
||||
],
|
||||
"first_hit_rank": null,
|
||||
"hit_at_k": false,
|
||||
"recall_at_k": 0.0,
|
||||
"error": null
|
||||
},
|
||||
"staged:static_global_graph": {
|
||||
"strategy_key": "staged:static_global_graph",
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "static_global_graph",
|
||||
"latency_ms": 13527.211199998856,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\ccw\\src\\commands\\memory.ts",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\handlers.py",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\ui\\commandcombobox.tsx",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\global_graph_expander.py",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\definition.py",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\orchestrator\\orchestrationplanbuilder.ts",
|
||||
"d:\\claude_dms3\\codex-lens\\build\\lib\\codexlens\\lsp\\handlers.py",
|
||||
"d:\\claude_dms3\\codex-lens\\build\\lib\\codexlens\\search\\global_graph_expander.py",
|
||||
"d:\\claude_dms3\\codex-lens\\build\\lib\\codexlens\\api\\definition.py",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\pages\\memorypage.tsx"
|
||||
],
|
||||
"first_hit_rank": null,
|
||||
"hit_at_k": false,
|
||||
"recall_at_k": 0.0,
|
||||
"error": null
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,415 @@
|
||||
{
|
||||
"timestamp": "2026-03-15 00:19:16",
|
||||
"source": "D:\\Claude_dms3",
|
||||
"queries_file": "D:\\Claude_dms3\\codex-lens\\benchmarks\\accuracy_queries_ccw_smart_search.jsonl",
|
||||
"query_count": 1,
|
||||
"k": 10,
|
||||
"coarse_k": 100,
|
||||
"local_only": true,
|
||||
"strategies": {
|
||||
"auto": {
|
||||
"query_count": 1,
|
||||
"hit_at_k": 1.0,
|
||||
"mrr_at_k": 1.0,
|
||||
"avg_recall_at_k": 1.0,
|
||||
"avg_latency_ms": 1377.3565999865532,
|
||||
"p50_latency_ms": 1377.3565999865532,
|
||||
"p95_latency_ms": 1377.3565999865532,
|
||||
"avg_generated_artifact_count": 0.0,
|
||||
"avg_test_file_count": 0.0,
|
||||
"runs_with_generated_artifacts": 0,
|
||||
"runs_with_test_files": 0,
|
||||
"effective_methods": {
|
||||
"fts": 1
|
||||
},
|
||||
"errors": 0,
|
||||
"strategy": "auto",
|
||||
"stage2_mode": null
|
||||
},
|
||||
"fts": {
|
||||
"query_count": 1,
|
||||
"hit_at_k": 1.0,
|
||||
"mrr_at_k": 1.0,
|
||||
"avg_recall_at_k": 1.0,
|
||||
"avg_latency_ms": 1460.0819000601768,
|
||||
"p50_latency_ms": 1460.0819000601768,
|
||||
"p95_latency_ms": 1460.0819000601768,
|
||||
"avg_generated_artifact_count": 0.0,
|
||||
"avg_test_file_count": 0.0,
|
||||
"runs_with_generated_artifacts": 0,
|
||||
"runs_with_test_files": 0,
|
||||
"effective_methods": {
|
||||
"fts": 1
|
||||
},
|
||||
"errors": 0,
|
||||
"strategy": "fts",
|
||||
"stage2_mode": null
|
||||
},
|
||||
"hybrid": {
|
||||
"query_count": 1,
|
||||
"hit_at_k": 0.0,
|
||||
"mrr_at_k": 0.0,
|
||||
"avg_recall_at_k": 0.0,
|
||||
"avg_latency_ms": 45991.74140000343,
|
||||
"p50_latency_ms": 45991.74140000343,
|
||||
"p95_latency_ms": 45991.74140000343,
|
||||
"avg_generated_artifact_count": 0.0,
|
||||
"avg_test_file_count": 0.0,
|
||||
"runs_with_generated_artifacts": 0,
|
||||
"runs_with_test_files": 0,
|
||||
"effective_methods": {
|
||||
"hybrid": 1
|
||||
},
|
||||
"errors": 0,
|
||||
"strategy": "hybrid",
|
||||
"stage2_mode": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"query_count": 1,
|
||||
"hit_at_k": 0.0,
|
||||
"mrr_at_k": 0.0,
|
||||
"avg_recall_at_k": 0.0,
|
||||
"avg_latency_ms": 22739.62610000372,
|
||||
"p50_latency_ms": 22739.62610000372,
|
||||
"p95_latency_ms": 22739.62610000372,
|
||||
"avg_generated_artifact_count": 1.0,
|
||||
"avg_test_file_count": 2.0,
|
||||
"runs_with_generated_artifacts": 1,
|
||||
"runs_with_test_files": 1,
|
||||
"effective_methods": {
|
||||
"dense_rerank": 1
|
||||
},
|
||||
"errors": 0,
|
||||
"strategy": "dense_rerank",
|
||||
"stage2_mode": null
|
||||
},
|
||||
"staged:precomputed": {
|
||||
"query_count": 1,
|
||||
"hit_at_k": 0.0,
|
||||
"mrr_at_k": 0.0,
|
||||
"avg_recall_at_k": 0.0,
|
||||
"avg_latency_ms": 14900.017599999905,
|
||||
"p50_latency_ms": 14900.017599999905,
|
||||
"p95_latency_ms": 14900.017599999905,
|
||||
"avg_generated_artifact_count": 1.0,
|
||||
"avg_test_file_count": 0.0,
|
||||
"runs_with_generated_artifacts": 1,
|
||||
"runs_with_test_files": 0,
|
||||
"effective_methods": {
|
||||
"staged": 1
|
||||
},
|
||||
"errors": 0,
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "precomputed"
|
||||
},
|
||||
"staged:realtime": {
|
||||
"query_count": 1,
|
||||
"hit_at_k": 0.0,
|
||||
"mrr_at_k": 0.0,
|
||||
"avg_recall_at_k": 0.0,
|
||||
"avg_latency_ms": 14104.314599990845,
|
||||
"p50_latency_ms": 14104.314599990845,
|
||||
"p95_latency_ms": 14104.314599990845,
|
||||
"avg_generated_artifact_count": 1.0,
|
||||
"avg_test_file_count": 0.0,
|
||||
"runs_with_generated_artifacts": 1,
|
||||
"runs_with_test_files": 0,
|
||||
"effective_methods": {
|
||||
"staged": 1
|
||||
},
|
||||
"errors": 0,
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "realtime"
|
||||
},
|
||||
"staged:static_global_graph": {
|
||||
"query_count": 1,
|
||||
"hit_at_k": 0.0,
|
||||
"mrr_at_k": 0.0,
|
||||
"avg_recall_at_k": 0.0,
|
||||
"avg_latency_ms": 11906.852500021458,
|
||||
"p50_latency_ms": 11906.852500021458,
|
||||
"p95_latency_ms": 11906.852500021458,
|
||||
"avg_generated_artifact_count": 1.0,
|
||||
"avg_test_file_count": 0.0,
|
||||
"runs_with_generated_artifacts": 1,
|
||||
"runs_with_test_files": 0,
|
||||
"effective_methods": {
|
||||
"staged": 1
|
||||
},
|
||||
"errors": 0,
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "static_global_graph"
|
||||
}
|
||||
},
|
||||
"stage2_mode_matrix": {
|
||||
"precomputed": {
|
||||
"query_count": 1,
|
||||
"hit_at_k": 0.0,
|
||||
"mrr_at_k": 0.0,
|
||||
"avg_recall_at_k": 0.0,
|
||||
"avg_latency_ms": 14900.017599999905,
|
||||
"p50_latency_ms": 14900.017599999905,
|
||||
"p95_latency_ms": 14900.017599999905,
|
||||
"avg_generated_artifact_count": 1.0,
|
||||
"avg_test_file_count": 0.0,
|
||||
"runs_with_generated_artifacts": 1,
|
||||
"runs_with_test_files": 0,
|
||||
"effective_methods": {
|
||||
"staged": 1
|
||||
},
|
||||
"errors": 0,
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "precomputed"
|
||||
},
|
||||
"realtime": {
|
||||
"query_count": 1,
|
||||
"hit_at_k": 0.0,
|
||||
"mrr_at_k": 0.0,
|
||||
"avg_recall_at_k": 0.0,
|
||||
"avg_latency_ms": 14104.314599990845,
|
||||
"p50_latency_ms": 14104.314599990845,
|
||||
"p95_latency_ms": 14104.314599990845,
|
||||
"avg_generated_artifact_count": 1.0,
|
||||
"avg_test_file_count": 0.0,
|
||||
"runs_with_generated_artifacts": 1,
|
||||
"runs_with_test_files": 0,
|
||||
"effective_methods": {
|
||||
"staged": 1
|
||||
},
|
||||
"errors": 0,
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "realtime"
|
||||
},
|
||||
"static_global_graph": {
|
||||
"query_count": 1,
|
||||
"hit_at_k": 0.0,
|
||||
"mrr_at_k": 0.0,
|
||||
"avg_recall_at_k": 0.0,
|
||||
"avg_latency_ms": 11906.852500021458,
|
||||
"p50_latency_ms": 11906.852500021458,
|
||||
"p95_latency_ms": 11906.852500021458,
|
||||
"avg_generated_artifact_count": 1.0,
|
||||
"avg_test_file_count": 0.0,
|
||||
"runs_with_generated_artifacts": 1,
|
||||
"runs_with_test_files": 0,
|
||||
"effective_methods": {
|
||||
"staged": 1
|
||||
},
|
||||
"errors": 0,
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "static_global_graph"
|
||||
}
|
||||
},
|
||||
"pairwise_stage2_deltas": [
|
||||
{
|
||||
"mode_a": "precomputed",
|
||||
"mode_b": "realtime",
|
||||
"hit_at_k_delta": 0.0,
|
||||
"mrr_at_k_delta": 0.0,
|
||||
"avg_recall_at_k_delta": 0.0,
|
||||
"avg_latency_ms_delta": 795.7030000090599
|
||||
},
|
||||
{
|
||||
"mode_a": "precomputed",
|
||||
"mode_b": "static_global_graph",
|
||||
"hit_at_k_delta": 0.0,
|
||||
"mrr_at_k_delta": 0.0,
|
||||
"avg_recall_at_k_delta": 0.0,
|
||||
"avg_latency_ms_delta": 2993.165099978447
|
||||
},
|
||||
{
|
||||
"mode_a": "realtime",
|
||||
"mode_b": "static_global_graph",
|
||||
"hit_at_k_delta": 0.0,
|
||||
"mrr_at_k_delta": 0.0,
|
||||
"avg_recall_at_k_delta": 0.0,
|
||||
"avg_latency_ms_delta": 2197.462099969387
|
||||
}
|
||||
],
|
||||
"config": {
|
||||
"embedding_backend": "fastembed",
|
||||
"embedding_model": "code",
|
||||
"embedding_use_gpu": false,
|
||||
"reranker_backend": "onnx",
|
||||
"reranker_model": "cross-encoder/ms-marco-MiniLM-L-6-v2",
|
||||
"reranker_use_gpu": false,
|
||||
"enable_staged_rerank": true,
|
||||
"enable_cross_encoder_rerank": true
|
||||
},
|
||||
"evaluations": [
|
||||
{
|
||||
"query": "executeHybridMode dense_rerank semantic smart_search",
|
||||
"intent": "ccw-semantic-routing",
|
||||
"notes": "CCW semantic mode delegates to CodexLens dense_rerank.",
|
||||
"relevant_paths": [
|
||||
"D:\\Claude_dms3\\ccw\\src\\tools\\smart-search.ts"
|
||||
],
|
||||
"runs": {
|
||||
"auto": {
|
||||
"strategy_key": "auto",
|
||||
"strategy": "auto",
|
||||
"stage2_mode": null,
|
||||
"effective_method": "fts",
|
||||
"execution_method": "fts",
|
||||
"latency_ms": 1377.3565999865532,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\ccw\\src\\tools\\smart-search.ts"
|
||||
],
|
||||
"first_hit_rank": 1,
|
||||
"hit_at_k": true,
|
||||
"recall_at_k": 1.0,
|
||||
"generated_artifact_count": 0,
|
||||
"test_file_count": 0,
|
||||
"error": null
|
||||
},
|
||||
"fts": {
|
||||
"strategy_key": "fts",
|
||||
"strategy": "fts",
|
||||
"stage2_mode": null,
|
||||
"effective_method": "fts",
|
||||
"execution_method": "fts",
|
||||
"latency_ms": 1460.0819000601768,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\ccw\\src\\tools\\smart-search.ts"
|
||||
],
|
||||
"first_hit_rank": 1,
|
||||
"hit_at_k": true,
|
||||
"recall_at_k": 1.0,
|
||||
"generated_artifact_count": 0,
|
||||
"test_file_count": 0,
|
||||
"error": null
|
||||
},
|
||||
"hybrid": {
|
||||
"strategy_key": "hybrid",
|
||||
"strategy": "hybrid",
|
||||
"stage2_mode": null,
|
||||
"effective_method": "hybrid",
|
||||
"execution_method": "hybrid",
|
||||
"latency_ms": 45991.74140000343,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\ccw\\src\\config\\litellm-api-config-manager.ts",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\api_reranker.py",
|
||||
"d:\\claude_dms3\\ccw\\src\\commands\\core-memory.ts",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
||||
"d:\\claude_dms3\\codex-lens\\scripts\\generate_embeddings.py",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\routes\\notification-routes.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\tools\\team-msg.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\types\\remote-notification.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\memory-store.ts",
|
||||
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py"
|
||||
],
|
||||
"first_hit_rank": null,
|
||||
"hit_at_k": false,
|
||||
"recall_at_k": 0.0,
|
||||
"generated_artifact_count": 0,
|
||||
"test_file_count": 0,
|
||||
"error": null
|
||||
},
|
||||
"dense_rerank": {
|
||||
"strategy_key": "dense_rerank",
|
||||
"strategy": "dense_rerank",
|
||||
"stage2_mode": null,
|
||||
"effective_method": "dense_rerank",
|
||||
"execution_method": "cascade",
|
||||
"latency_ms": 22739.62610000372,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\routes\\issue-routes.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\tools\\session-manager.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\types\\queue-types.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\nativesessionpanel.tsx",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\history-importer.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\memory-extraction-pipeline.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\tests\\e2e\\skills-page.spec.ts",
|
||||
"d:\\claude_dms3\\ccw\\dist\\tools\\discover-design-files.js",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\api-settings\\clisettingsmodal.tsx",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\tests\\e2e\\api-settings.spec.ts"
|
||||
],
|
||||
"first_hit_rank": null,
|
||||
"hit_at_k": false,
|
||||
"recall_at_k": 0.0,
|
||||
"generated_artifact_count": 1,
|
||||
"test_file_count": 2,
|
||||
"error": null
|
||||
},
|
||||
"staged:precomputed": {
|
||||
"strategy_key": "staged:precomputed",
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "precomputed",
|
||||
"effective_method": "staged",
|
||||
"execution_method": "cascade",
|
||||
"latency_ms": 14900.017599999905,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\ccw\\src\\tools\\native-session-discovery.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\commands\\memory.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\useissues.test.tsx",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\routes\\cli-sessions-routes.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\lib\\api.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\filepreview.tsx",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\hooks\\hook-templates.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\utils\\file-reader.ts",
|
||||
"d:\\claude_dms3\\ccw\\dist\\core\\routes\\cli-sessions-routes.js",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\history-importer.ts"
|
||||
],
|
||||
"first_hit_rank": null,
|
||||
"hit_at_k": false,
|
||||
"recall_at_k": 0.0,
|
||||
"generated_artifact_count": 1,
|
||||
"test_file_count": 0,
|
||||
"error": null
|
||||
},
|
||||
"staged:realtime": {
|
||||
"strategy_key": "staged:realtime",
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "realtime",
|
||||
"effective_method": "staged",
|
||||
"execution_method": "cascade",
|
||||
"latency_ms": 14104.314599990845,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\ccw\\src\\tools\\native-session-discovery.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\commands\\memory.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\useissues.test.tsx",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\routes\\cli-sessions-routes.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\lib\\api.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\filepreview.tsx",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\hooks\\hook-templates.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\utils\\file-reader.ts",
|
||||
"d:\\claude_dms3\\ccw\\dist\\core\\routes\\cli-sessions-routes.js",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\history-importer.ts"
|
||||
],
|
||||
"first_hit_rank": null,
|
||||
"hit_at_k": false,
|
||||
"recall_at_k": 0.0,
|
||||
"generated_artifact_count": 1,
|
||||
"test_file_count": 0,
|
||||
"error": null
|
||||
},
|
||||
"staged:static_global_graph": {
|
||||
"strategy_key": "staged:static_global_graph",
|
||||
"strategy": "staged",
|
||||
"stage2_mode": "static_global_graph",
|
||||
"effective_method": "staged",
|
||||
"execution_method": "cascade",
|
||||
"latency_ms": 11906.852500021458,
|
||||
"topk_paths": [
|
||||
"d:\\claude_dms3\\ccw\\src\\tools\\native-session-discovery.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\commands\\memory.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\useissues.test.tsx",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\routes\\cli-sessions-routes.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\lib\\api.ts",
|
||||
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\filepreview.tsx",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\hooks\\hook-templates.ts",
|
||||
"d:\\claude_dms3\\ccw\\src\\utils\\file-reader.ts",
|
||||
"d:\\claude_dms3\\ccw\\dist\\core\\routes\\cli-sessions-routes.js",
|
||||
"d:\\claude_dms3\\ccw\\src\\core\\history-importer.ts"
|
||||
],
|
||||
"first_hit_rank": null,
|
||||
"hit_at_k": false,
|
||||
"recall_at_k": 0.0,
|
||||
"generated_artifact_count": 1,
|
||||
"test_file_count": 0,
|
||||
"error": null
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -57,9 +57,9 @@ semantic-directml = [
|
||||
# Cross-encoder reranking (second-stage, optional)
|
||||
# Install with: pip install codexlens[reranker] (default: ONNX backend)
|
||||
reranker-onnx = [
|
||||
"optimum~=1.16.0",
|
||||
"onnxruntime~=1.15.0",
|
||||
"transformers~=4.36.0",
|
||||
"optimum[onnxruntime]~=2.1.0",
|
||||
"onnxruntime~=1.23.0",
|
||||
"transformers~=4.53.0",
|
||||
]
|
||||
|
||||
# Remote reranking via HTTP API
|
||||
@@ -79,9 +79,9 @@ reranker-legacy = [
|
||||
|
||||
# Backward-compatible alias for default reranker backend
|
||||
reranker = [
|
||||
"optimum~=1.16.0",
|
||||
"onnxruntime~=1.15.0",
|
||||
"transformers~=4.36.0",
|
||||
"optimum[onnxruntime]~=2.1.0",
|
||||
"onnxruntime~=1.23.0",
|
||||
"transformers~=4.53.0",
|
||||
]
|
||||
|
||||
# Encoding detection for non-UTF8 files
|
||||
@@ -116,3 +116,12 @@ package-dir = { "" = "src" }
|
||||
|
||||
[tool.setuptools.package-data]
|
||||
"codexlens.lsp" = ["lsp-servers.json"]
|
||||
|
||||
[tool.pytest.ini_options]
|
||||
markers = [
|
||||
"integration: marks tests that exercise broader end-to-end or dependency-heavy flows",
|
||||
]
|
||||
filterwarnings = [
|
||||
"ignore:'BaseCommand' is deprecated and will be removed in Click 9.0.*:DeprecationWarning",
|
||||
"ignore:The '__version__' attribute is deprecated and will be removed in Click 9.1.*:DeprecationWarning",
|
||||
]
|
||||
|
||||
340
codex-lens/scripts/bootstrap_reranker_local.py
Normal file
340
codex-lens/scripts/bootstrap_reranker_local.py
Normal file
@@ -0,0 +1,340 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Bootstrap a local-only ONNX reranker environment for CodexLens.
|
||||
|
||||
This script defaults to dry-run output so it can be used as a reproducible
|
||||
bootstrap manifest. When `--apply` is passed, it installs pinned reranker
|
||||
packages into the selected virtual environment and can optionally pre-download
|
||||
the ONNX reranker model into a repo-local Hugging Face cache.
|
||||
|
||||
Examples:
|
||||
python scripts/bootstrap_reranker_local.py --dry-run
|
||||
python scripts/bootstrap_reranker_local.py --apply --download-model
|
||||
python scripts/bootstrap_reranker_local.py --venv .venv --model Xenova/ms-marco-MiniLM-L-12-v2
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import shlex
|
||||
import subprocess
|
||||
import sys
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Iterable
|
||||
|
||||
|
||||
# Repository root for codex-lens (this script lives in <root>/scripts/).
PROJECT_ROOT = Path(__file__).resolve().parents[1]
# Ordered pip-pin manifest that drives the bootstrap steps.
MANIFEST_PATH = Path(__file__).with_name("requirements-reranker-local.txt")
# Default ONNX cross-encoder model pre-downloaded by --download-model.
DEFAULT_MODEL = "Xenova/ms-marco-MiniLM-L-6-v2"
# Repo-local Hugging Face cache so downloads never touch the user-global cache.
DEFAULT_HF_HOME = PROJECT_ROOT / ".cache" / "huggingface"

# Human-readable rationale printed next to each manifest section in dry-run output.
# Keys must match the "# [name]" section headers in MANIFEST_PATH.
STEP_NOTES = {
    "runtime": "Install the local ONNX runtime first so optimum/transformers do not backtrack over runtime wheels.",
    "hf-stack": "Pin the Hugging Face stack used by the ONNX reranker backend.",
}
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class RequirementStep:
    """One ordered pip-install step parsed from the reranker manifest.

    Steps are installed in manifest order so the pip resolver never has to
    solve the whole dependency set at once.
    """

    # Section name from the manifest header, e.g. "runtime" or "hf-stack".
    name: str
    # Pinned requirement specifiers installed together in a single pip call.
    packages: tuple[str, ...]
|
||||
|
||||
|
||||
def _normalize_venv_path(raw_path: str | Path) -> Path:
|
||||
return (Path(raw_path) if raw_path else PROJECT_ROOT / ".venv").expanduser().resolve()
|
||||
|
||||
|
||||
def _venv_python(venv_path: Path) -> Path:
|
||||
if os.name == "nt":
|
||||
return venv_path / "Scripts" / "python.exe"
|
||||
return venv_path / "bin" / "python"
|
||||
|
||||
|
||||
def _venv_huggingface_cli(venv_path: Path) -> Path:
|
||||
if os.name == "nt":
|
||||
preferred = venv_path / "Scripts" / "hf.exe"
|
||||
return preferred if preferred.exists() else venv_path / "Scripts" / "huggingface-cli.exe"
|
||||
preferred = venv_path / "bin" / "hf"
|
||||
return preferred if preferred.exists() else venv_path / "bin" / "huggingface-cli"
|
||||
|
||||
|
||||
def _default_shell() -> str:
|
||||
return "powershell" if os.name == "nt" else "bash"
|
||||
|
||||
|
||||
def _shell_quote(value: str, shell: str) -> str:
|
||||
if shell == "bash":
|
||||
return shlex.quote(value)
|
||||
return "'" + value.replace("'", "''") + "'"
|
||||
|
||||
|
||||
def _format_command(parts: Iterable[str], shell: str) -> str:
|
||||
return " ".join(_shell_quote(str(part), shell) for part in parts)
|
||||
|
||||
|
||||
def _format_set_env(name: str, value: str, shell: str) -> str:
|
||||
quoted_value = _shell_quote(value, shell)
|
||||
if shell == "bash":
|
||||
return f"export {name}={quoted_value}"
|
||||
return f"$env:{name} = {quoted_value}"
|
||||
|
||||
|
||||
def _model_local_dir(hf_home: Path, model_name: str) -> Path:
|
||||
slug = model_name.replace("/", "--")
|
||||
return hf_home / "models" / slug
|
||||
|
||||
|
||||
def _parse_manifest(manifest_path: Path) -> list[RequirementStep]:
    """Parse the ordered requirements manifest into named install steps.

    Section headers look like ``# [name]``; any other ``#`` line is a comment.
    Non-empty, non-comment lines are requirement specifiers belonging to the
    most recent section header.

    Raises ValueError when a package precedes the first header or when the
    manifest yields no steps at all.
    """
    steps: list[RequirementStep] = []
    section: str | None = None
    pending: list[str] = []

    def _flush() -> None:
        # Only emit sections that actually collected packages.
        if section and pending:
            steps.append(RequirementStep(section, tuple(pending)))

    for raw_line in manifest_path.read_text(encoding="utf-8").splitlines():
        line = raw_line.strip()
        if not line:
            continue
        if line.startswith("# [") and line.endswith("]"):
            _flush()
            section = line[3:-1]
            pending = []
            continue
        if line.startswith("#"):
            continue
        if section is None:
            raise ValueError(f"Package entry found before a section header in {manifest_path}")
        pending.append(line)

    _flush()

    if not steps:
        raise ValueError(f"No requirement steps found in {manifest_path}")
    return steps
|
||||
|
||||
|
||||
def _pip_install_command(python_path: Path, packages: Iterable[str]) -> list[str]:
|
||||
return [
|
||||
str(python_path),
|
||||
"-m",
|
||||
"pip",
|
||||
"install",
|
||||
"--upgrade",
|
||||
"--disable-pip-version-check",
|
||||
"--upgrade-strategy",
|
||||
"only-if-needed",
|
||||
"--only-binary=:all:",
|
||||
*packages,
|
||||
]
|
||||
|
||||
|
||||
def _probe_command(python_path: Path) -> list[str]:
|
||||
return [
|
||||
str(python_path),
|
||||
"-c",
|
||||
(
|
||||
"from codexlens.semantic.reranker.factory import check_reranker_available; "
|
||||
"print(check_reranker_available('onnx'))"
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
def _download_command(huggingface_cli: Path, model_name: str, model_dir: Path) -> list[str]:
|
||||
return [
|
||||
str(huggingface_cli),
|
||||
"download",
|
||||
model_name,
|
||||
"--local-dir",
|
||||
str(model_dir),
|
||||
]
|
||||
|
||||
|
||||
def _print_plan(
    shell: str,
    venv_path: Path,
    python_path: Path,
    huggingface_cli: Path,
    manifest_path: Path,
    steps: list[RequirementStep],
    model_name: str,
    hf_home: Path,
) -> None:
    """Print the deterministic dry-run bootstrap plan to stdout.

    Renders the pinned step list, the exact shell commands that --apply would
    run (numbered, quoted for the requested *shell*), and the optional runtime
    environment variables for the local ONNX reranker backend. Pure output:
    nothing is executed and no files are touched.
    """
    model_dir = _model_local_dir(hf_home, model_name)

    # Header / context summary.
    print("CodexLens local reranker bootstrap")
    print(f"manifest: {manifest_path}")
    print(f"target_venv: {venv_path}")
    print(f"target_python: {python_path}")
    # Fixed literal: was an f-string with no placeholders (ruff F541).
    print("backend: onnx")
    print(f"model: {model_name}")
    print(f"hf_home: {hf_home}")
    print("mode: dry-run")
    print("notes:")
    print("- Uses only the selected venv Python; no global pip commands are emitted.")
    print("- Targets the local ONNX reranker backend only; no API or LiteLLM providers are involved.")
    print("")

    # Pinned manifest sections, with the rationale note when one is defined.
    print("pinned_steps:")
    for step in steps:
        print(f"- {step.name}: {', '.join(step.packages)}")
        note = STEP_NOTES.get(step.name)
        if note:
            print(f"  note: {note}")
    print("")

    # Numbered command list mirroring exactly what --apply would execute.
    print("commands:")
    print(
        "1. "
        + _format_command(
            [
                str(python_path),
                "-m",
                "pip",
                "install",
                "--upgrade",
                "pip",
                "setuptools",
                "wheel",
            ],
            shell,
        )
    )
    command_index = 2
    for step in steps:
        print(f"{command_index}. " + _format_command(_pip_install_command(python_path, step.packages), shell))
        command_index += 1
    print(f"{command_index}. " + _format_set_env("HF_HOME", str(hf_home), shell))
    command_index += 1
    print(f"{command_index}. " + _format_command(_download_command(huggingface_cli, model_name, model_dir), shell))
    command_index += 1
    print(f"{command_index}. " + _format_command(_probe_command(python_path), shell))
    print("")

    # Environment variables the user can export to select this backend at runtime.
    print("optional_runtime_env:")
    print(_format_set_env("RERANKER_BACKEND", "onnx", shell))
    print(_format_set_env("RERANKER_MODEL", str(model_dir), shell))
    print(_format_set_env("HF_HOME", str(hf_home), shell))
|
||||
|
||||
|
||||
def _run_command(command: list[str], *, env: dict[str, str] | None = None) -> None:
|
||||
command_env = os.environ.copy()
|
||||
if env:
|
||||
command_env.update(env)
|
||||
command_env.setdefault("PYTHONUTF8", "1")
|
||||
command_env.setdefault("PYTHONIOENCODING", "utf-8")
|
||||
subprocess.run(command, check=True, env=command_env)
|
||||
|
||||
|
||||
def main() -> int:
    """CLI entry point: print the bootstrap plan, or apply it with --apply.

    Returns a process exit code: 0 on success (including dry-run), 1 when the
    target venv Python or the venv-local Hugging Face CLI is missing.
    """
    parser = argparse.ArgumentParser(
        description="Bootstrap pinned local-only ONNX reranker dependencies for a CodexLens virtual environment.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        # Module docstring doubles as usage examples in --help output.
        epilog=__doc__,
    )
    parser.add_argument(
        "--venv",
        type=Path,
        default=PROJECT_ROOT / ".venv",
        help="Path to the CodexLens virtual environment (default: ./.venv under codex-lens).",
    )
    parser.add_argument(
        "--model",
        default=DEFAULT_MODEL,
        help=f"Model repo to pre-download for local reranking (default: {DEFAULT_MODEL}).",
    )
    parser.add_argument(
        "--hf-home",
        type=Path,
        default=DEFAULT_HF_HOME,
        help="Repo-local Hugging Face cache directory used for optional model downloads.",
    )
    parser.add_argument(
        "--shell",
        choices=("powershell", "bash"),
        default=_default_shell(),
        help="Shell syntax to use when rendering dry-run commands.",
    )
    parser.add_argument(
        "--apply",
        action="store_true",
        help="Execute the pinned install steps against the selected virtual environment.",
    )
    parser.add_argument(
        "--download-model",
        action="store_true",
        help="When used with --apply, pre-download the model into the configured HF_HOME directory.",
    )
    parser.add_argument(
        "--probe",
        action="store_true",
        help="When used with --apply, run a small reranker availability probe at the end.",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Print the deterministic bootstrap plan. This is also the default when --apply is omitted.",
    )

    args = parser.parse_args()

    # Resolve the manifest and all venv-relative paths up front, before branching.
    steps = _parse_manifest(MANIFEST_PATH)
    venv_path = _normalize_venv_path(args.venv)
    python_path = _venv_python(venv_path)
    huggingface_cli = _venv_huggingface_cli(venv_path)
    hf_home = args.hf_home.expanduser().resolve()

    # Dry-run is the default: without --apply we only render the plan.
    # (--dry-run exists for explicitness; its value is never consulted.)
    if not args.apply:
        _print_plan(
            shell=args.shell,
            venv_path=venv_path,
            python_path=python_path,
            huggingface_cli=huggingface_cli,
            manifest_path=MANIFEST_PATH,
            steps=steps,
            model_name=args.model,
            hf_home=hf_home,
        )
        return 0

    if not python_path.exists():
        print(f"Target venv Python not found: {python_path}", file=sys.stderr)
        return 1

    # Upgrade build tooling first so subsequent pinned installs behave consistently.
    _run_command(
        [
            str(python_path),
            "-m",
            "pip",
            "install",
            "--upgrade",
            "pip",
            "setuptools",
            "wheel",
        ]
    )
    # Install each manifest section in file order to keep resolver work bounded.
    for step in steps:
        _run_command(_pip_install_command(python_path, step.packages))

    if args.download_model:
        if not huggingface_cli.exists():
            print(f"Expected venv-local Hugging Face CLI not found: {huggingface_cli}", file=sys.stderr)
            return 1
        # Redirect the HF cache into the repo-local directory for this download only.
        download_env = os.environ.copy()
        download_env["HF_HOME"] = str(hf_home)
        hf_home.mkdir(parents=True, exist_ok=True)
        _run_command(_download_command(huggingface_cli, args.model, _model_local_dir(hf_home, args.model)), env=download_env)

    if args.probe:
        local_model_dir = _model_local_dir(hf_home, args.model)
        probe_env = os.environ.copy()
        probe_env["HF_HOME"] = str(hf_home)
        # setdefault: respect any backend/model the caller already exported.
        probe_env.setdefault("RERANKER_BACKEND", "onnx")
        # Prefer the locally mirrored model dir when it exists; otherwise the repo id.
        probe_env.setdefault("RERANKER_MODEL", str(local_model_dir if local_model_dir.exists() else args.model))
        _run_command(_probe_command(python_path), env=probe_env)

    return 0


if __name__ == "__main__":
    # SystemExit propagates main()'s return value as the process exit code.
    raise SystemExit(main())
|
||||
13
codex-lens/scripts/requirements-reranker-local.txt
Normal file
13
codex-lens/scripts/requirements-reranker-local.txt
Normal file
@@ -0,0 +1,13 @@
|
||||
# Ordered local ONNX reranker pins for CodexLens.
|
||||
# Validated against the repo-local Python 3.13 virtualenv on Windows.
|
||||
# bootstrap_reranker_local.py installs each section in file order to keep
|
||||
# pip resolver work bounded and repeatable.
|
||||
|
||||
# [runtime]
|
||||
numpy==2.4.0
|
||||
onnxruntime==1.23.2
|
||||
|
||||
# [hf-stack]
|
||||
huggingface-hub==0.36.2
|
||||
transformers==4.53.3
|
||||
optimum[onnxruntime]==2.1.0
|
||||
@@ -2,10 +2,13 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import inspect
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from typing import Annotated, Any, Dict, Iterable, List, Optional
|
||||
|
||||
@@ -22,6 +25,13 @@ from codexlens.storage.registry import RegistryStore, ProjectInfo
|
||||
from codexlens.storage.index_tree import IndexTreeBuilder
|
||||
from codexlens.storage.dir_index import DirIndexStore
|
||||
from codexlens.search.chain_search import ChainSearchEngine, SearchOptions
|
||||
from codexlens.search.ranking import (
|
||||
QueryIntent,
|
||||
apply_path_penalties,
|
||||
detect_query_intent,
|
||||
query_prefers_lexical_search,
|
||||
query_targets_generated_files,
|
||||
)
|
||||
from codexlens.watcher import WatcherManager, WatcherConfig
|
||||
|
||||
from .output import (
|
||||
@@ -34,6 +44,56 @@ from .output import (
|
||||
)
|
||||
|
||||
app = typer.Typer(help="CodexLens CLI — local code indexing and search.")
|
||||
# Index subcommand group for reorganized commands
|
||||
def _patch_typer_click_help_compat() -> None:
    """Patch Typer help rendering for Click versions that pass ctx to make_metavar().

    Newer Click calls ``Parameter.make_metavar(ctx)`` while older Typer releases
    define ``TyperArgument.make_metavar(self)`` with no ctx parameter, which
    breaks ``--help``. This installs a ctx-tolerant shim on both classes.
    No-op when the installed Typer already accepts ctx.
    """
    import click.core
    from typer.core import TyperArgument

    try:
        params = inspect.signature(TyperArgument.make_metavar).parameters
    except (TypeError, ValueError):
        # Signature introspection failed (e.g. C-level callable); leave untouched.
        return

    # Only patch the legacy single-parameter (self-only) Typer signature.
    if len(params) != 1:
        return

    def _compat_make_metavar(self, ctx=None):  # type: ignore[override]
        # Re-implementation of TyperArgument.make_metavar that tolerates ctx.
        if self.metavar is not None:
            return self.metavar

        var = (self.name or "").upper()
        if not self.required:
            var = f"[{var}]"

        # Click has changed get_metavar's calling convention across versions;
        # try keyword form, then positional with ctx, then self-only.
        try:
            type_var = self.type.get_metavar(param=self, ctx=ctx)
        except TypeError:
            try:
                type_var = self.type.get_metavar(self, ctx)
            except TypeError:
                type_var = self.type.get_metavar(self)

        if type_var:
            var += f":{type_var}"
        if self.nargs != 1:
            var += "..."
        return var

    TyperArgument.make_metavar = _compat_make_metavar

    # Mirror the shim on click.core.Parameter when it expects (self, ctx), so
    # callers that omit ctx still work.
    param_params = inspect.signature(click.core.Parameter.make_metavar).parameters
    if len(param_params) == 2:
        original_param_make_metavar = click.core.Parameter.make_metavar

        def _compat_param_make_metavar(self, ctx=None):  # type: ignore[override]
            return original_param_make_metavar(self, ctx)

        click.core.Parameter.make_metavar = _compat_param_make_metavar


# Apply the compatibility shim at import time, before any Typer app renders help.
_patch_typer_click_help_compat()
|
||||
|
||||
|
||||
# Index subcommand group for reorganized commands
|
||||
index_app = typer.Typer(help="Index management commands (init, embeddings, binary, status, migrate, all)")
|
||||
@@ -119,6 +179,281 @@ def _extract_embedding_error(embed_result: Dict[str, Any]) -> str:
|
||||
return "Embedding generation failed (no error details provided)"
|
||||
|
||||
|
||||
def _auto_select_search_method(query: str) -> str:
    """Choose a default search method from query intent.

    Lexical-looking queries (generated-file targets or explicit lexical hints)
    short-circuit to FTS; otherwise the detected intent picks the method.
    """
    prefers_lexical = query_targets_generated_files(query) or query_prefers_lexical_search(query)
    if prefers_lexical:
        return "fts"

    intent_to_method = {
        QueryIntent.KEYWORD: "fts",
        QueryIntent.SEMANTIC: "dense_rerank",
    }
    return intent_to_method.get(detect_query_intent(query), "hybrid")
|
||||
|
||||
|
||||
# File extensions (lowercase, no leading dot) treated as non-code by CLI search
# filtering: docs, data/config formats, media, office documents, lockfiles.
_CLI_NON_CODE_EXTENSIONS = {
    "md", "txt", "json", "yaml", "yml", "xml", "csv", "log",
    "ini", "cfg", "conf", "toml", "env", "properties",
    "html", "htm", "svg", "png", "jpg", "jpeg", "gif", "ico", "webp",
    "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx",
    "lock", "sum", "mod",
}
# Directory names whose contents are build artifacts or caches; matches on any
# path segment are de-prioritized by the filesystem fallback search.
_FALLBACK_ARTIFACT_DIRS = {
    "dist",
    "build",
    "out",
    "coverage",
    "htmlcov",
    ".cache",
    ".workflow",
    ".next",
    ".nuxt",
    ".parcel-cache",
    ".turbo",
    "tmp",
    "temp",
    "generated",
}
# Directory names that conventionally hold first-party source; hits under them
# receive a score boost in the filesystem fallback search.
_FALLBACK_SOURCE_DIRS = {
    "src",
    "lib",
    "core",
    "app",
    "server",
    "client",
    "services",
}
|
||||
|
||||
|
||||
def _normalize_extension_filters(exclude_extensions: Optional[Iterable[str]]) -> set[str]:
    """Normalize extension filters to lowercase values without leading dots.

    Blank or None entries are dropped; only leading dots are stripped, so
    interior/trailing dots are preserved exactly as given.
    """
    return {
        cleaned
        for raw in (exclude_extensions or ())
        if (cleaned := (raw or "").strip().lower().lstrip("."))
    }
|
||||
|
||||
|
||||
def _score_filesystem_fallback_match(
    query: str,
    path_text: str,
    line_text: str,
    *,
    base_score: float,
) -> float:
    """Score filesystem fallback hits with light source-aware heuristics.

    Non-positive base scores stay at 0. Boosts apply only to keyword-intent
    queries: +15% for hits under source directories and +80% when the line
    looks like a definition of the queried symbol.
    """
    score = max(0.0, float(base_score))
    if score <= 0:
        return 0.0

    # Heuristic boosts are only meaningful for symbol-style keyword lookups.
    if detect_query_intent(query) != QueryIntent.KEYWORD:
        return score

    normalized_segments = str(path_text).replace("\\", "/").split("/")
    segment_names = {
        segment.casefold() for segment in normalized_segments if segment and segment != "."
    }
    if not _FALLBACK_SOURCE_DIRS.isdisjoint(segment_names):
        score *= 1.15

    symbol = (query or "").strip()
    if not symbol or " " in symbol:
        return score

    escaped_symbol = re.escape(symbol)
    # Covers Python and JS/TS definition forms for the exact symbol.
    definition_patterns = (
        rf"^\s*(?:export\s+)?(?:async\s+)?def\s+{escaped_symbol}\b",
        rf"^\s*(?:export\s+)?(?:async\s+)?function\s+{escaped_symbol}\b",
        rf"^\s*(?:export\s+)?class\s+{escaped_symbol}\b",
        rf"^\s*(?:export\s+)?interface\s+{escaped_symbol}\b",
        rf"^\s*(?:export\s+)?type\s+{escaped_symbol}\b",
        rf"^\s*(?:export\s+)?(?:const|let|var)\s+{escaped_symbol}\b",
    )
    for pattern in definition_patterns:
        if re.search(pattern, line_text):
            score *= 1.8
            break

    return score
|
||||
|
||||
|
||||
def _filesystem_fallback_search(
    query: str,
    search_path: Path,
    *,
    limit: int,
    config: Config,
    code_only: bool = False,
    exclude_extensions: Optional[Iterable[str]] = None,
) -> Optional[dict[str, Any]]:
    """Fallback to ripgrep when indexed keyword search returns no results.

    Args:
        query: Literal search text (rg runs with --fixed-strings).
        search_path: Directory root handed to ripgrep.
        limit: Maximum number of results to return (one hit per file).
        config: Active Config; used for language detection and path penalties.
        code_only: When True, drop files without a recognized code language.
        exclude_extensions: Extra file extensions to filter out.

    Returns:
        None when ripgrep is unavailable, the query is blank, rg fails, or
        nothing matched; otherwise a dict with "results" (List[SearchResult]),
        "time_ms", and a "fallback" payload marking the index as suspect.
    """
    rg_path = shutil.which("rg")
    if not rg_path or not query.strip():
        return None

    import time

    # Queries that explicitly ask for generated files may search artifact
    # directories (and hidden paths) that are normally ignored.
    allow_generated = query_targets_generated_files(query)
    ignored_dirs = {name for name in IndexTreeBuilder.IGNORE_DIRS if name}
    ignored_dirs.add(".workflow")
    if allow_generated:
        ignored_dirs.difference_update(_FALLBACK_ARTIFACT_DIRS)

    excluded_exts = _normalize_extension_filters(exclude_extensions)
    if code_only:
        excluded_exts.update(_CLI_NON_CODE_EXTENSIONS)

    # --json emits structured events parsed below; --max-count 1 caps rg at
    # one matching line per file.
    args = [
        rg_path,
        "--json",
        "--line-number",
        "--fixed-strings",
        "--smart-case",
        "--max-count",
        "1",
    ]
    if allow_generated:
        args.append("--hidden")

    for dirname in sorted(ignored_dirs):
        args.extend(["--glob", f"!**/{dirname}/**"])

    args.extend([query, str(search_path)])

    start_time = time.perf_counter()
    proc = subprocess.run(
        args,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        encoding="utf-8",
        errors="replace",
        check=False,
    )

    # rg exits 0 on matches and 1 on "no matches"; anything else is an error.
    if proc.returncode not in (0, 1):
        return None

    matches: List[SearchResult] = []
    seen_paths: set[str] = set()
    for raw_line in proc.stdout.splitlines():
        if len(matches) >= limit:
            break
        try:
            event = json.loads(raw_line)
        except json.JSONDecodeError:
            continue
        if event.get("type") != "match":
            continue

        data = event.get("data") or {}
        path_text = ((data.get("path") or {}).get("text") or "").strip()
        if not path_text or path_text in seen_paths:
            continue

        path_obj = Path(path_text)
        extension = path_obj.suffix.lower().lstrip(".")
        if extension and extension in excluded_exts:
            continue
        if code_only and config.language_for_path(path_obj) is None:
            continue

        line_text = ((data.get("lines") or {}).get("text") or "").rstrip("\r\n")
        line_number = data.get("line_number")
        seen_paths.add(path_text)
        # Earlier hits receive higher base scores (limit, limit-1, ...),
        # which the heuristic scorer may then boost further.
        base_score = float(limit - len(matches))
        matches.append(
            SearchResult(
                path=path_text,
                score=_score_filesystem_fallback_match(
                    query,
                    path_text,
                    line_text,
                    base_score=base_score,
                ),
                excerpt=line_text.strip() or line_text or path_text,
                content=None,
                metadata={
                    "filesystem_fallback": True,
                    "backend": "ripgrep-fallback",
                    "stale_index_suspected": True,
                },
                start_line=line_number,
                end_line=line_number,
            )
        )

    if not matches:
        return None

    # Apply the same test/generated-file penalties as indexed search so
    # fallback rankings stay comparable.
    matches = apply_path_penalties(
        matches,
        query,
        test_file_penalty=config.test_file_penalty,
        generated_file_penalty=config.generated_file_penalty,
    )
    return {
        "results": matches,
        "time_ms": (time.perf_counter() - start_time) * 1000.0,
        "fallback": {
            "backend": "ripgrep-fallback",
            "stale_index_suspected": True,
            "reason": "Indexed FTS search returned no results; filesystem fallback used.",
        },
    }
|
||||
|
||||
|
||||
def _remove_tree_best_effort(target: Path) -> dict[str, Any]:
    """Remove a directory tree without aborting on locked files.

    Returns a summary dict with "removed", "partial", sorted unique
    "locked_paths", "errors", and "remaining_path" (None when fully gone).
    """
    target = target.resolve()
    if not target.exists():
        return {
            "removed": True,
            "partial": False,
            "locked_paths": [],
            "errors": [],
            "remaining_path": None,
        }

    locked_paths: List[str] = []
    errors: List[str] = []

    # Deepest entries first so directories are empty before we rmdir them.
    deepest_first = sorted(
        target.rglob("*"), key=lambda node: len(node.parts), reverse=True
    )
    for node in deepest_first:
        try:
            if node.is_dir() and not node.is_symlink():
                node.rmdir()
            else:
                node.unlink()
        except FileNotFoundError:
            continue
        except PermissionError:
            locked_paths.append(str(node))
        except OSError as exc:
            # Directories failing for other reasons (e.g. non-empty due to a
            # locked child) are silently skipped; file errors are recorded.
            if node.is_dir():
                continue
            errors.append(f"{node}: {exc}")

    try:
        target.rmdir()
    except PermissionError:
        locked_paths.append(str(target))
    except OSError:
        # Covers FileNotFoundError and non-empty root; leftovers reported below.
        pass

    still_present = target.exists()
    return {
        "removed": not still_present,
        "partial": still_present,
        "locked_paths": sorted(set(locked_paths)),
        "errors": errors,
        "remaining_path": str(target) if still_present else None,
    }
|
||||
|
||||
|
||||
def _get_index_root() -> Path:
|
||||
"""Get the index root directory from config or default.
|
||||
|
||||
@@ -542,7 +877,7 @@ def search(
|
||||
offset: int = typer.Option(0, "--offset", min=0, help="Pagination offset - skip first N results."),
|
||||
depth: int = typer.Option(-1, "--depth", "-d", help="Search depth (-1 = unlimited, 0 = current only)."),
|
||||
files_only: bool = typer.Option(False, "--files-only", "-f", help="Return only file paths without content snippets."),
|
||||
method: str = typer.Option("dense_rerank", "--method", "-m", help="Search method: 'dense_rerank' (semantic, default), 'fts' (exact keyword)."),
|
||||
method: str = typer.Option("auto", "--method", "-m", help="Search method: 'auto' (intent-aware, default), 'dense_rerank' (semantic), 'fts' (exact keyword)."),
|
||||
use_fuzzy: bool = typer.Option(False, "--use-fuzzy", help="Enable fuzzy matching in FTS method."),
|
||||
code_only: bool = typer.Option(False, "--code-only", help="Only return code files (excludes md, txt, json, yaml, xml, etc.)."),
|
||||
exclude_extensions: Optional[str] = typer.Option(None, "--exclude-extensions", help="Comma-separated list of file extensions to exclude (e.g., 'md,txt,json')."),
|
||||
@@ -576,14 +911,16 @@ def search(
|
||||
Use --depth to limit search recursion (0 = current dir only).
|
||||
|
||||
Search Methods:
|
||||
- dense_rerank (default): Semantic search using Dense embedding coarse retrieval +
|
||||
- auto (default): Intent-aware routing. KEYWORD -> fts, MIXED -> hybrid,
|
||||
SEMANTIC -> dense_rerank.
|
||||
- dense_rerank: Semantic search using Dense embedding coarse retrieval +
|
||||
Cross-encoder reranking. Best for natural language queries and code understanding.
|
||||
- fts: Full-text search using FTS5 (unicode61 tokenizer). Best for exact code
|
||||
identifiers like function/class names. Use --use-fuzzy for typo tolerance.
|
||||
|
||||
Method Selection Guide:
|
||||
- Code identifiers (function/class names): fts
|
||||
- Natural language queries: dense_rerank (default)
|
||||
- Code identifiers (function/class names): auto or fts
|
||||
- Natural language queries: auto or dense_rerank
|
||||
- Typo-tolerant search: fts --use-fuzzy
|
||||
|
||||
Requirements:
|
||||
@@ -591,7 +928,7 @@ def search(
|
||||
Use 'codexlens embeddings-generate' to create embeddings first.
|
||||
|
||||
Examples:
|
||||
# Default semantic search (dense_rerank)
|
||||
# Default intent-aware search
|
||||
codexlens search "authentication logic"
|
||||
|
||||
# Exact code identifier search
|
||||
@@ -612,7 +949,7 @@ def search(
|
||||
|
||||
# Map old mode values to new method values
|
||||
mode_to_method = {
|
||||
"auto": "hybrid",
|
||||
"auto": "auto",
|
||||
"exact": "fts",
|
||||
"fuzzy": "fts", # with use_fuzzy=True
|
||||
"hybrid": "hybrid",
|
||||
@@ -638,19 +975,27 @@ def search(
|
||||
|
||||
# Validate method - simplified interface exposes only dense_rerank and fts
|
||||
# Other methods (vector, hybrid, cascade) are hidden but still work for backward compatibility
|
||||
valid_methods = ["fts", "dense_rerank", "vector", "hybrid", "cascade"]
|
||||
valid_methods = ["auto", "fts", "dense_rerank", "vector", "hybrid", "cascade"]
|
||||
if actual_method not in valid_methods:
|
||||
if json_mode:
|
||||
print_json(success=False, error=f"Invalid method: {actual_method}. Use 'dense_rerank' (semantic) or 'fts' (exact keyword).")
|
||||
print_json(success=False, error=f"Invalid method: {actual_method}. Use 'auto', 'dense_rerank', or 'fts'.")
|
||||
else:
|
||||
console.print(f"[red]Invalid method:[/red] {actual_method}")
|
||||
console.print("[dim]Use 'dense_rerank' (semantic, default) or 'fts' (exact keyword)[/dim]")
|
||||
console.print("[dim]Use 'auto' (default), 'dense_rerank' (semantic), or 'fts' (exact keyword)[/dim]")
|
||||
raise typer.Exit(code=1)
|
||||
|
||||
resolved_method = (
|
||||
_auto_select_search_method(query)
|
||||
if actual_method == "auto"
|
||||
else actual_method
|
||||
)
|
||||
display_method = resolved_method
|
||||
execution_method = resolved_method
|
||||
|
||||
# Map dense_rerank to cascade method internally
|
||||
internal_cascade_strategy = cascade_strategy
|
||||
if actual_method == "dense_rerank":
|
||||
actual_method = "cascade"
|
||||
if execution_method == "dense_rerank":
|
||||
execution_method = "cascade"
|
||||
internal_cascade_strategy = "dense_rerank"
|
||||
|
||||
# Validate cascade_strategy if provided (for advanced users)
|
||||
@@ -733,32 +1078,32 @@ def search(
|
||||
# vector: Pure vector semantic search
|
||||
# hybrid: RRF fusion of sparse + dense
|
||||
# cascade: Two-stage binary + dense retrieval
|
||||
if actual_method == "fts":
|
||||
if execution_method == "fts":
|
||||
hybrid_mode = False
|
||||
enable_fuzzy = use_fuzzy
|
||||
enable_vector = False
|
||||
pure_vector = False
|
||||
enable_cascade = False
|
||||
elif actual_method == "vector":
|
||||
elif execution_method == "vector":
|
||||
hybrid_mode = True
|
||||
enable_fuzzy = False
|
||||
enable_vector = True
|
||||
pure_vector = True
|
||||
enable_cascade = False
|
||||
elif actual_method == "hybrid":
|
||||
elif execution_method == "hybrid":
|
||||
hybrid_mode = True
|
||||
enable_fuzzy = use_fuzzy
|
||||
enable_vector = True
|
||||
pure_vector = False
|
||||
enable_cascade = False
|
||||
elif actual_method == "cascade":
|
||||
elif execution_method == "cascade":
|
||||
hybrid_mode = True
|
||||
enable_fuzzy = False
|
||||
enable_vector = True
|
||||
pure_vector = False
|
||||
enable_cascade = True
|
||||
else:
|
||||
raise ValueError(f"Invalid method: {actual_method}")
|
||||
raise ValueError(f"Invalid method: {execution_method}")
|
||||
|
||||
# Parse exclude_extensions from comma-separated string
|
||||
exclude_exts_list = None
|
||||
@@ -790,10 +1135,28 @@ def search(
|
||||
console.print(fp)
|
||||
else:
|
||||
# Dispatch to cascade_search for cascade method
|
||||
if actual_method == "cascade":
|
||||
if execution_method == "cascade":
|
||||
result = engine.cascade_search(query, search_path, k=limit, options=options, strategy=internal_cascade_strategy)
|
||||
else:
|
||||
result = engine.search(query, search_path, options)
|
||||
effective_results = result.results
|
||||
effective_files_matched = result.stats.files_matched
|
||||
effective_time_ms = result.stats.time_ms
|
||||
fallback_payload = None
|
||||
if display_method == "fts" and not use_fuzzy and not effective_results:
|
||||
fallback_payload = _filesystem_fallback_search(
|
||||
query,
|
||||
search_path,
|
||||
limit=limit,
|
||||
config=config,
|
||||
code_only=code_only,
|
||||
exclude_extensions=exclude_exts_list,
|
||||
)
|
||||
if fallback_payload is not None:
|
||||
effective_results = fallback_payload["results"]
|
||||
effective_files_matched = len(effective_results)
|
||||
effective_time_ms = result.stats.time_ms + float(fallback_payload["time_ms"])
|
||||
|
||||
results_list = [
|
||||
{
|
||||
"path": r.path,
|
||||
@@ -803,25 +1166,29 @@ def search(
|
||||
"source": getattr(r, "search_source", None),
|
||||
"symbol": getattr(r, "symbol", None),
|
||||
}
|
||||
for r in result.results
|
||||
for r in effective_results
|
||||
]
|
||||
|
||||
payload = {
|
||||
"query": query,
|
||||
"method": actual_method,
|
||||
"method": display_method,
|
||||
"count": len(results_list),
|
||||
"results": results_list,
|
||||
"stats": {
|
||||
"dirs_searched": result.stats.dirs_searched,
|
||||
"files_matched": result.stats.files_matched,
|
||||
"time_ms": result.stats.time_ms,
|
||||
"files_matched": effective_files_matched,
|
||||
"time_ms": effective_time_ms,
|
||||
},
|
||||
}
|
||||
if fallback_payload is not None:
|
||||
payload["fallback"] = fallback_payload["fallback"]
|
||||
if json_mode:
|
||||
print_json(success=True, result=payload)
|
||||
else:
|
||||
render_search_results(result.results, verbose=verbose)
|
||||
console.print(f"[dim]Method: {actual_method} | Searched {result.stats.dirs_searched} directories in {result.stats.time_ms:.1f}ms[/dim]")
|
||||
render_search_results(effective_results, verbose=verbose)
|
||||
if fallback_payload is not None:
|
||||
console.print("[yellow]No indexed matches found; showing filesystem fallback results (stale index suspected).[/yellow]")
|
||||
console.print(f"[dim]Method: {display_method} | Searched {result.stats.dirs_searched} directories in {effective_time_ms:.1f}ms[/dim]")
|
||||
|
||||
except SearchError as exc:
|
||||
if json_mode:
|
||||
@@ -1454,7 +1821,7 @@ def projects(
|
||||
mapper = PathMapper()
|
||||
index_root = mapper.source_to_index_dir(project_path)
|
||||
if index_root.exists():
|
||||
shutil.rmtree(index_root)
|
||||
_remove_tree_best_effort(index_root)
|
||||
|
||||
if json_mode:
|
||||
print_json(success=True, result={"removed": str(project_path)})
|
||||
@@ -1966,17 +2333,30 @@ def clean(
|
||||
registry_path.unlink()
|
||||
|
||||
# Remove all indexes
|
||||
shutil.rmtree(index_root)
|
||||
removal = _remove_tree_best_effort(index_root)
|
||||
|
||||
result = {
|
||||
"cleaned": str(index_root),
|
||||
"size_freed_mb": round(total_size / (1024 * 1024), 2),
|
||||
"partial": bool(removal["partial"]),
|
||||
"locked_paths": removal["locked_paths"],
|
||||
"remaining_path": removal["remaining_path"],
|
||||
"errors": removal["errors"],
|
||||
}
|
||||
|
||||
if json_mode:
|
||||
print_json(success=True, result=result)
|
||||
else:
|
||||
console.print(f"[green]Removed all indexes:[/green] {result['size_freed_mb']} MB freed")
|
||||
if result["partial"]:
|
||||
console.print(
|
||||
f"[yellow]Partially removed all indexes:[/yellow] {result['size_freed_mb']} MB freed"
|
||||
)
|
||||
if result["locked_paths"]:
|
||||
console.print(
|
||||
f"[dim]Locked paths left behind: {len(result['locked_paths'])}[/dim]"
|
||||
)
|
||||
else:
|
||||
console.print(f"[green]Removed all indexes:[/green] {result['size_freed_mb']} MB freed")
|
||||
|
||||
elif path:
|
||||
# Remove specific project
|
||||
@@ -2003,18 +2383,29 @@ def clean(
|
||||
registry.close()
|
||||
|
||||
# Remove indexes
|
||||
shutil.rmtree(project_index)
|
||||
removal = _remove_tree_best_effort(project_index)
|
||||
|
||||
result = {
|
||||
"cleaned": str(project_path),
|
||||
"index_path": str(project_index),
|
||||
"size_freed_mb": round(total_size / (1024 * 1024), 2),
|
||||
"partial": bool(removal["partial"]),
|
||||
"locked_paths": removal["locked_paths"],
|
||||
"remaining_path": removal["remaining_path"],
|
||||
"errors": removal["errors"],
|
||||
}
|
||||
|
||||
if json_mode:
|
||||
print_json(success=True, result=result)
|
||||
else:
|
||||
console.print(f"[green]Removed indexes for:[/green] {project_path}")
|
||||
if result["partial"]:
|
||||
console.print(f"[yellow]Partially removed indexes for:[/yellow] {project_path}")
|
||||
if result["locked_paths"]:
|
||||
console.print(
|
||||
f"[dim]Locked paths left behind: {len(result['locked_paths'])}[/dim]"
|
||||
)
|
||||
else:
|
||||
console.print(f"[green]Removed indexes for:[/green] {project_path}")
|
||||
console.print(f" Freed: {result['size_freed_mb']} MB")
|
||||
|
||||
else:
|
||||
@@ -2617,7 +3008,7 @@ def embeddings_status(
|
||||
codexlens embeddings-status ~/projects/my-app # Check project (auto-finds index)
|
||||
"""
|
||||
_deprecated_command_warning("embeddings-status", "index status")
|
||||
from codexlens.cli.embedding_manager import check_index_embeddings, get_embedding_stats_summary
|
||||
from codexlens.cli.embedding_manager import get_embedding_stats_summary, get_embeddings_status
|
||||
|
||||
# Determine what to check
|
||||
if path is None:
|
||||
@@ -3715,7 +4106,7 @@ def index_status(
|
||||
"""
|
||||
_configure_logging(verbose, json_mode)
|
||||
|
||||
from codexlens.cli.embedding_manager import check_index_embeddings, get_embedding_stats_summary
|
||||
from codexlens.cli.embedding_manager import get_embedding_stats_summary, get_embeddings_status
|
||||
|
||||
# Determine target path and index root
|
||||
if path is None:
|
||||
@@ -3751,13 +4142,19 @@ def index_status(
|
||||
raise typer.Exit(code=1)
|
||||
|
||||
# Get embeddings status
|
||||
embeddings_result = get_embedding_stats_summary(index_root)
|
||||
embeddings_result = get_embeddings_status(index_root)
|
||||
embeddings_summary_result = get_embedding_stats_summary(index_root)
|
||||
|
||||
# Build combined result
|
||||
result = {
|
||||
"index_root": str(index_root),
|
||||
"embeddings": embeddings_result.get("result") if embeddings_result.get("success") else None,
|
||||
"embeddings_error": embeddings_result.get("error") if not embeddings_result.get("success") else None,
|
||||
# Keep "embeddings" backward-compatible as the subtree summary payload.
|
||||
"embeddings": embeddings_summary_result.get("result") if embeddings_summary_result.get("success") else None,
|
||||
"embeddings_error": embeddings_summary_result.get("error") if not embeddings_summary_result.get("success") else None,
|
||||
"embeddings_status": embeddings_result.get("result") if embeddings_result.get("success") else None,
|
||||
"embeddings_status_error": embeddings_result.get("error") if not embeddings_result.get("success") else None,
|
||||
"embeddings_summary": embeddings_summary_result.get("result") if embeddings_summary_result.get("success") else None,
|
||||
"embeddings_summary_error": embeddings_summary_result.get("error") if not embeddings_summary_result.get("success") else None,
|
||||
}
|
||||
|
||||
if json_mode:
|
||||
@@ -3770,13 +4167,39 @@ def index_status(
|
||||
console.print("[bold]Dense Embeddings (HNSW):[/bold]")
|
||||
if embeddings_result.get("success"):
|
||||
data = embeddings_result["result"]
|
||||
total = data.get("total_indexes", 0)
|
||||
with_emb = data.get("indexes_with_embeddings", 0)
|
||||
total_chunks = data.get("total_chunks", 0)
|
||||
root = data.get("root") or data
|
||||
subtree = data.get("subtree") or {}
|
||||
centralized = data.get("centralized") or {}
|
||||
|
||||
console.print(f" Total indexes: {total}")
|
||||
console.print(f" Indexes with embeddings: [{'green' if with_emb > 0 else 'yellow'}]{with_emb}[/]/{total}")
|
||||
console.print(f" Total chunks: {total_chunks:,}")
|
||||
console.print(f" Root files: {root.get('total_files', 0)}")
|
||||
console.print(
|
||||
f" Root files with embeddings: "
|
||||
f"[{'green' if root.get('has_embeddings') else 'yellow'}]{root.get('files_with_embeddings', 0)}[/]"
|
||||
f"/{root.get('total_files', 0)}"
|
||||
)
|
||||
console.print(f" Root coverage: {root.get('coverage_percent', 0):.1f}%")
|
||||
console.print(f" Root chunks: {root.get('total_chunks', 0):,}")
|
||||
console.print(f" Root storage mode: {root.get('storage_mode', 'none')}")
|
||||
console.print(
|
||||
f" Centralized dense: "
|
||||
f"{'ready' if centralized.get('dense_ready') else ('present' if centralized.get('dense_index_exists') else 'missing')}"
|
||||
)
|
||||
console.print(
|
||||
f" Centralized binary: "
|
||||
f"{'ready' if centralized.get('binary_ready') else ('present' if centralized.get('binary_index_exists') else 'missing')}"
|
||||
)
|
||||
|
||||
subtree_total = subtree.get("total_indexes", 0)
|
||||
subtree_with_embeddings = subtree.get("indexes_with_embeddings", 0)
|
||||
subtree_chunks = subtree.get("total_chunks", 0)
|
||||
if subtree_total:
|
||||
console.print("\n[bold]Subtree Summary:[/bold]")
|
||||
console.print(f" Total indexes: {subtree_total}")
|
||||
console.print(
|
||||
f" Indexes with embeddings: "
|
||||
f"[{'green' if subtree_with_embeddings > 0 else 'yellow'}]{subtree_with_embeddings}[/]/{subtree_total}"
|
||||
)
|
||||
console.print(f" Total chunks: {subtree_chunks:,}")
|
||||
else:
|
||||
console.print(f" [yellow]--[/yellow] {embeddings_result.get('error', 'Not available')}")
|
||||
|
||||
|
||||
@@ -48,6 +48,8 @@ from itertools import islice
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Generator, List, Optional, Tuple
|
||||
|
||||
from codexlens.storage.index_filters import filter_index_paths
|
||||
|
||||
try:
|
||||
from codexlens.semantic import SEMANTIC_AVAILABLE, is_embedding_backend_available
|
||||
except ImportError:
|
||||
@@ -61,9 +63,15 @@ except ImportError: # pragma: no cover
|
||||
VectorStore = None # type: ignore[assignment]
|
||||
|
||||
try:
|
||||
from codexlens.config import VECTORS_META_DB_NAME
|
||||
from codexlens.config import (
|
||||
BINARY_VECTORS_MMAP_NAME,
|
||||
VECTORS_HNSW_NAME,
|
||||
VECTORS_META_DB_NAME,
|
||||
)
|
||||
except ImportError:
|
||||
VECTORS_HNSW_NAME = "_vectors.hnsw"
|
||||
VECTORS_META_DB_NAME = "_vectors_meta.db"
|
||||
BINARY_VECTORS_MMAP_NAME = "_binary_vectors.mmap"
|
||||
|
||||
try:
|
||||
from codexlens.search.ranking import get_file_category
|
||||
@@ -410,6 +418,98 @@ def check_index_embeddings(index_path: Path) -> Dict[str, any]:
|
||||
}
|
||||
|
||||
|
||||
def _quote_sqlite_identifier(name: str) -> str:
    """Return *name* quoted as a SQLite identifier (embedded quotes doubled)."""
    return '"' + name.replace('"', '""') + '"'


def _sqlite_table_exists(conn: sqlite3.Connection, table_name: str) -> bool:
    """Return whether a SQLite table exists.

    Args:
        conn: Open SQLite connection.
        table_name: Exact table name (case-sensitive match in sqlite_master).
    """
    cursor = conn.execute(
        "SELECT name FROM sqlite_master WHERE type='table' AND name=?",
        (table_name,),
    )
    return cursor.fetchone() is not None


def _sqlite_count_rows(conn: sqlite3.Connection, table_name: str) -> int:
    """Return row count for a table, or 0 when the table is absent.

    Identifiers cannot be bound as SQL parameters, so the table name is
    quoted to keep unusual names from breaking the statement.
    """
    if not _sqlite_table_exists(conn, table_name):
        return 0
    cursor = conn.execute(
        f"SELECT COUNT(*) FROM {_quote_sqlite_identifier(table_name)}"
    )
    return int(cursor.fetchone()[0] or 0)


def _sqlite_count_distinct_rows(conn: sqlite3.Connection, table_name: str, column_name: str) -> int:
    """Return distinct row count for a table column, or 0 when the table is absent."""
    if not _sqlite_table_exists(conn, table_name):
        return 0
    cursor = conn.execute(
        f"SELECT COUNT(DISTINCT {_quote_sqlite_identifier(column_name)}) "
        f"FROM {_quote_sqlite_identifier(table_name)}"
    )
    return int(cursor.fetchone()[0] or 0)
|
||||
|
||||
|
||||
def _get_model_info_from_index(index_path: Path) -> Optional[Dict[str, Any]]:
    """Read embedding model metadata from an index if available.

    Returns None (never raises) when the index lacks an embeddings_config
    table, the VectorStore has no model config, or any error occurs while
    reading — callers treat this metadata as strictly best-effort.
    """
    try:
        # sqlite3's context manager only manages transactions and does NOT
        # close the connection; close explicitly to avoid leaking a file
        # handle on every probed index.
        conn = sqlite3.connect(index_path)
        try:
            has_config = _sqlite_table_exists(conn, "embeddings_config")
        finally:
            conn.close()
        if not has_config:
            return None
        from codexlens.semantic.vector_store import VectorStore
        with VectorStore(index_path) as vs:
            config = vs.get_model_config()
        if not config:
            return None
        return {
            "model_profile": config.get("model_profile"),
            "model_name": config.get("model_name"),
            "embedding_dim": config.get("embedding_dim"),
            "backend": config.get("backend"),
            "created_at": config.get("created_at"),
            "updated_at": config.get("updated_at"),
        }
    except Exception:
        # Deliberate best-effort swallow: model info is optional decoration
        # for status output.
        return None
|
||||
|
||||
|
||||
def _inspect_centralized_embeddings(index_root: Path) -> Dict[str, Any]:
    """Inspect centralized vector artifacts stored directly at the current root.

    Checks the dense HNSW index, binary vector mmap, and metadata DB under
    *index_root* and reports row counts plus readiness flags. Never raises;
    metadata read failures are surfaced via an "error" key in the result.
    """
    dense_index_path = index_root / VECTORS_HNSW_NAME
    meta_db_path = index_root / VECTORS_META_DB_NAME
    binary_index_path = index_root / BINARY_VECTORS_MMAP_NAME

    # Stat each artifact exactly once so the reported *_path strings and the
    # *_exists flags cannot disagree if files change mid-inspection (the
    # original stat'ed each path twice).
    dense_exists = dense_index_path.exists()
    binary_exists = binary_index_path.exists()
    meta_exists = meta_db_path.exists()

    result: Dict[str, Any] = {
        "index_root": str(index_root),
        "dense_index_path": str(dense_index_path) if dense_exists else None,
        "binary_index_path": str(binary_index_path) if binary_exists else None,
        "meta_db_path": str(meta_db_path) if meta_exists else None,
        "dense_index_exists": dense_exists,
        "binary_index_exists": binary_exists,
        "meta_db_exists": meta_exists,
        "chunk_metadata_rows": 0,
        "binary_vector_rows": 0,
        "files_with_embeddings": 0,
        "dense_ready": False,
        "binary_ready": False,
        "usable": False,
    }

    if not meta_exists:
        return result

    try:
        with sqlite3.connect(meta_db_path) as conn:
            result["chunk_metadata_rows"] = _sqlite_count_rows(conn, "chunk_metadata")
            result["binary_vector_rows"] = _sqlite_count_rows(conn, "binary_vectors")
            result["files_with_embeddings"] = _sqlite_count_distinct_rows(
                conn, "chunk_metadata", "file_path"
            )
    except Exception as exc:
        result["error"] = f"Failed to inspect centralized metadata: {exc}"
        return result

    # Readiness requires both the artifact file and non-empty metadata.
    result["dense_ready"] = dense_exists and result["chunk_metadata_rows"] > 0
    result["binary_ready"] = (
        binary_exists
        and result["chunk_metadata_rows"] > 0
        and result["binary_vector_rows"] > 0
    )
    result["usable"] = result["dense_ready"] or result["binary_ready"]
    return result
|
||||
|
||||
|
||||
def _get_embedding_defaults() -> tuple[str, str, bool, List, str, float]:
|
||||
"""Get default embedding settings from config.
|
||||
|
||||
@@ -1024,7 +1124,7 @@ def _discover_index_dbs_internal(index_root: Path) -> List[Path]:
|
||||
if not index_root.exists():
|
||||
return []
|
||||
|
||||
return sorted(index_root.rglob("_index.db"))
|
||||
return sorted(filter_index_paths(index_root.rglob("_index.db"), index_root))
|
||||
|
||||
|
||||
def build_centralized_binary_vectors_from_existing(
|
||||
@@ -1353,7 +1453,7 @@ def find_all_indexes(scan_dir: Path) -> List[Path]:
|
||||
if not scan_dir.exists():
|
||||
return []
|
||||
|
||||
return list(scan_dir.rglob("_index.db"))
|
||||
return _discover_index_dbs_internal(scan_dir)
|
||||
|
||||
|
||||
|
||||
@@ -1866,8 +1966,32 @@ def get_embeddings_status(index_root: Path) -> Dict[str, any]:
|
||||
Aggregated status with coverage statistics, model info, and timestamps
|
||||
"""
|
||||
index_files = _discover_index_dbs_internal(index_root)
|
||||
centralized = _inspect_centralized_embeddings(index_root)
|
||||
root_index_path = index_root / "_index.db"
|
||||
root_index_exists = root_index_path.exists()
|
||||
|
||||
if not index_files:
|
||||
root_result = {
|
||||
"index_path": str(root_index_path),
|
||||
"exists": root_index_exists,
|
||||
"total_files": 0,
|
||||
"files_with_embeddings": 0,
|
||||
"files_without_embeddings": 0,
|
||||
"total_chunks": 0,
|
||||
"coverage_percent": 0.0,
|
||||
"has_embeddings": False,
|
||||
"storage_mode": "none",
|
||||
}
|
||||
subtree_result = {
|
||||
"total_indexes": 0,
|
||||
"total_files": 0,
|
||||
"files_with_embeddings": 0,
|
||||
"files_without_embeddings": 0,
|
||||
"total_chunks": 0,
|
||||
"coverage_percent": 0.0,
|
||||
"indexes_with_embeddings": 0,
|
||||
"indexes_without_embeddings": 0,
|
||||
}
|
||||
return {
|
||||
"success": True,
|
||||
"result": {
|
||||
@@ -1880,72 +2004,123 @@ def get_embeddings_status(index_root: Path) -> Dict[str, any]:
|
||||
"indexes_with_embeddings": 0,
|
||||
"indexes_without_embeddings": 0,
|
||||
"model_info": None,
|
||||
"root": root_result,
|
||||
"subtree": subtree_result,
|
||||
"centralized": centralized,
|
||||
},
|
||||
}
|
||||
|
||||
total_files = 0
|
||||
files_with_embeddings = 0
|
||||
total_chunks = 0
|
||||
indexes_with_embeddings = 0
|
||||
model_info = None
|
||||
subtree_total_files = 0
|
||||
subtree_files_with_embeddings = 0
|
||||
subtree_total_chunks = 0
|
||||
subtree_indexes_with_embeddings = 0
|
||||
subtree_model_info = None
|
||||
latest_updated_at = None
|
||||
|
||||
for index_path in index_files:
|
||||
status = check_index_embeddings(index_path)
|
||||
if status["success"]:
|
||||
result = status["result"]
|
||||
total_files += result["total_files"]
|
||||
files_with_embeddings += result["files_with_chunks"]
|
||||
total_chunks += result["total_chunks"]
|
||||
if result["has_embeddings"]:
|
||||
indexes_with_embeddings += 1
|
||||
if not status["success"]:
|
||||
continue
|
||||
|
||||
# Get model config from first index with embeddings (they should all match)
|
||||
if model_info is None:
|
||||
try:
|
||||
from codexlens.semantic.vector_store import VectorStore
|
||||
with VectorStore(index_path) as vs:
|
||||
config = vs.get_model_config()
|
||||
if config:
|
||||
model_info = {
|
||||
"model_profile": config.get("model_profile"),
|
||||
"model_name": config.get("model_name"),
|
||||
"embedding_dim": config.get("embedding_dim"),
|
||||
"backend": config.get("backend"),
|
||||
"created_at": config.get("created_at"),
|
||||
"updated_at": config.get("updated_at"),
|
||||
}
|
||||
latest_updated_at = config.get("updated_at")
|
||||
except Exception:
|
||||
pass
|
||||
else:
|
||||
# Track the latest updated_at across all indexes
|
||||
try:
|
||||
from codexlens.semantic.vector_store import VectorStore
|
||||
with VectorStore(index_path) as vs:
|
||||
config = vs.get_model_config()
|
||||
if config and config.get("updated_at"):
|
||||
if latest_updated_at is None or config["updated_at"] > latest_updated_at:
|
||||
latest_updated_at = config["updated_at"]
|
||||
except Exception:
|
||||
pass
|
||||
result = status["result"]
|
||||
subtree_total_files += result["total_files"]
|
||||
subtree_files_with_embeddings += result["files_with_chunks"]
|
||||
subtree_total_chunks += result["total_chunks"]
|
||||
|
||||
# Update model_info with latest timestamp
|
||||
if model_info and latest_updated_at:
|
||||
model_info["updated_at"] = latest_updated_at
|
||||
if not result["has_embeddings"]:
|
||||
continue
|
||||
|
||||
subtree_indexes_with_embeddings += 1
|
||||
candidate_model_info = _get_model_info_from_index(index_path)
|
||||
if not candidate_model_info:
|
||||
continue
|
||||
if subtree_model_info is None:
|
||||
subtree_model_info = candidate_model_info
|
||||
latest_updated_at = candidate_model_info.get("updated_at")
|
||||
continue
|
||||
candidate_updated_at = candidate_model_info.get("updated_at")
|
||||
if candidate_updated_at and (latest_updated_at is None or candidate_updated_at > latest_updated_at):
|
||||
latest_updated_at = candidate_updated_at
|
||||
|
||||
if subtree_model_info and latest_updated_at:
|
||||
subtree_model_info["updated_at"] = latest_updated_at
|
||||
|
||||
root_total_files = 0
|
||||
root_files_with_embeddings = 0
|
||||
root_total_chunks = 0
|
||||
root_has_embeddings = False
|
||||
root_storage_mode = "none"
|
||||
|
||||
if root_index_exists:
|
||||
root_status = check_index_embeddings(root_index_path)
|
||||
if root_status["success"]:
|
||||
root_data = root_status["result"]
|
||||
root_total_files = int(root_data["total_files"])
|
||||
if root_data["has_embeddings"]:
|
||||
root_files_with_embeddings = int(root_data["files_with_chunks"])
|
||||
root_total_chunks = int(root_data["total_chunks"])
|
||||
root_has_embeddings = True
|
||||
root_storage_mode = "distributed"
|
||||
|
||||
if centralized["usable"]:
|
||||
root_files_with_embeddings = int(centralized["files_with_embeddings"])
|
||||
root_total_chunks = int(centralized["chunk_metadata_rows"])
|
||||
root_has_embeddings = True
|
||||
root_storage_mode = "centralized" if root_storage_mode == "none" else "mixed"
|
||||
|
||||
model_info = None
|
||||
if root_has_embeddings:
|
||||
if root_storage_mode in {"distributed", "mixed"} and root_index_exists:
|
||||
model_info = _get_model_info_from_index(root_index_path)
|
||||
if model_info is None and root_storage_mode in {"centralized", "mixed"}:
|
||||
model_info = subtree_model_info
|
||||
|
||||
root_coverage_percent = round(
|
||||
(root_files_with_embeddings / root_total_files * 100) if root_total_files > 0 else 0,
|
||||
1,
|
||||
)
|
||||
root_files_without_embeddings = max(root_total_files - root_files_with_embeddings, 0)
|
||||
|
||||
root_result = {
|
||||
"index_path": str(root_index_path),
|
||||
"exists": root_index_exists,
|
||||
"total_files": root_total_files,
|
||||
"files_with_embeddings": root_files_with_embeddings,
|
||||
"files_without_embeddings": root_files_without_embeddings,
|
||||
"total_chunks": root_total_chunks,
|
||||
"coverage_percent": root_coverage_percent,
|
||||
"has_embeddings": root_has_embeddings,
|
||||
"storage_mode": root_storage_mode,
|
||||
}
|
||||
subtree_result = {
|
||||
"total_indexes": len(index_files),
|
||||
"total_files": subtree_total_files,
|
||||
"files_with_embeddings": subtree_files_with_embeddings,
|
||||
"files_without_embeddings": subtree_total_files - subtree_files_with_embeddings,
|
||||
"total_chunks": subtree_total_chunks,
|
||||
"coverage_percent": round(
|
||||
(subtree_files_with_embeddings / subtree_total_files * 100) if subtree_total_files > 0 else 0,
|
||||
1,
|
||||
),
|
||||
"indexes_with_embeddings": subtree_indexes_with_embeddings,
|
||||
"indexes_without_embeddings": len(index_files) - subtree_indexes_with_embeddings,
|
||||
}
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"result": {
|
||||
"total_indexes": len(index_files),
|
||||
"total_files": total_files,
|
||||
"files_with_embeddings": files_with_embeddings,
|
||||
"files_without_embeddings": total_files - files_with_embeddings,
|
||||
"total_chunks": total_chunks,
|
||||
"coverage_percent": round((files_with_embeddings / total_files * 100) if total_files > 0 else 0, 1),
|
||||
"indexes_with_embeddings": indexes_with_embeddings,
|
||||
"indexes_without_embeddings": len(index_files) - indexes_with_embeddings,
|
||||
"total_indexes": 1 if root_index_exists else 0,
|
||||
"total_files": root_total_files,
|
||||
"files_with_embeddings": root_files_with_embeddings,
|
||||
"files_without_embeddings": root_files_without_embeddings,
|
||||
"total_chunks": root_total_chunks,
|
||||
"coverage_percent": root_coverage_percent,
|
||||
"indexes_with_embeddings": 1 if root_has_embeddings else 0,
|
||||
"indexes_without_embeddings": 1 if root_index_exists and not root_has_embeddings else 0,
|
||||
"model_info": model_info,
|
||||
"root": root_result,
|
||||
"subtree": subtree_result,
|
||||
"centralized": centralized,
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
@@ -126,11 +126,14 @@ class Config:
|
||||
enable_reranking: bool = False
|
||||
reranking_top_k: int = 50
|
||||
symbol_boost_factor: float = 1.5
|
||||
test_file_penalty: float = 0.15 # Penalty for test/fixture paths during final ranking
|
||||
generated_file_penalty: float = 0.35 # Penalty for generated/build artifact paths during final ranking
|
||||
|
||||
# Optional cross-encoder reranking (second stage; requires optional reranker deps)
|
||||
enable_cross_encoder_rerank: bool = False
|
||||
reranker_backend: str = "onnx"
|
||||
reranker_model: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"
|
||||
reranker_use_gpu: bool = True # Whether reranker backends should use GPU acceleration
|
||||
reranker_top_k: int = 50
|
||||
reranker_max_input_tokens: int = 8192 # Maximum tokens for reranker API batching
|
||||
reranker_chunk_type_weights: Optional[Dict[str, float]] = None # Weights for chunk types: {"code": 1.0, "docstring": 0.7}
|
||||
@@ -312,6 +315,7 @@ class Config:
|
||||
"enabled": self.enable_cross_encoder_rerank,
|
||||
"backend": self.reranker_backend,
|
||||
"model": self.reranker_model,
|
||||
"use_gpu": self.reranker_use_gpu,
|
||||
"top_k": self.reranker_top_k,
|
||||
"max_input_tokens": self.reranker_max_input_tokens,
|
||||
"pool_enabled": self.reranker_pool_enabled,
|
||||
@@ -418,6 +422,8 @@ class Config:
|
||||
)
|
||||
if "model" in reranker:
|
||||
self.reranker_model = reranker["model"]
|
||||
if "use_gpu" in reranker:
|
||||
self.reranker_use_gpu = reranker["use_gpu"]
|
||||
if "top_k" in reranker:
|
||||
self.reranker_top_k = reranker["top_k"]
|
||||
if "max_input_tokens" in reranker:
|
||||
@@ -712,6 +718,7 @@ class Config:
|
||||
EMBEDDING_COOLDOWN: Rate limit cooldown for embedding
|
||||
RERANKER_MODEL: Override reranker model
|
||||
RERANKER_BACKEND: Override reranker backend
|
||||
RERANKER_USE_GPU: Override reranker GPU usage (true/false)
|
||||
RERANKER_ENABLED: Override reranker enabled state (true/false)
|
||||
RERANKER_POOL_ENABLED: Enable reranker high availability pool
|
||||
RERANKER_STRATEGY: Load balance strategy for reranker
|
||||
@@ -832,6 +839,11 @@ class Config:
|
||||
else:
|
||||
log.warning("Invalid RERANKER_BACKEND in .env: %r", reranker_backend)
|
||||
|
||||
reranker_use_gpu = get_env("RERANKER_USE_GPU")
|
||||
if reranker_use_gpu:
|
||||
self.reranker_use_gpu = _parse_bool(reranker_use_gpu)
|
||||
log.debug("Overriding reranker_use_gpu from .env: %s", self.reranker_use_gpu)
|
||||
|
||||
reranker_enabled = get_env("RERANKER_ENABLED")
|
||||
if reranker_enabled:
|
||||
value = reranker_enabled.lower()
|
||||
@@ -878,6 +890,25 @@ class Config:
|
||||
except ValueError:
|
||||
log.warning("Invalid RERANKER_TEST_FILE_PENALTY in .env: %r", test_penalty)
|
||||
|
||||
ranking_test_penalty = get_env("TEST_FILE_PENALTY")
|
||||
if ranking_test_penalty:
|
||||
try:
|
||||
self.test_file_penalty = float(ranking_test_penalty)
|
||||
log.debug("Overriding test_file_penalty from .env: %s", self.test_file_penalty)
|
||||
except ValueError:
|
||||
log.warning("Invalid TEST_FILE_PENALTY in .env: %r", ranking_test_penalty)
|
||||
|
||||
generated_penalty = get_env("GENERATED_FILE_PENALTY")
|
||||
if generated_penalty:
|
||||
try:
|
||||
self.generated_file_penalty = float(generated_penalty)
|
||||
log.debug(
|
||||
"Overriding generated_file_penalty from .env: %s",
|
||||
self.generated_file_penalty,
|
||||
)
|
||||
except ValueError:
|
||||
log.warning("Invalid GENERATED_FILE_PENALTY in .env: %r", generated_penalty)
|
||||
|
||||
docstring_weight = get_env("RERANKER_DOCSTRING_WEIGHT")
|
||||
if docstring_weight:
|
||||
try:
|
||||
|
||||
@@ -23,6 +23,7 @@ ENV_VARS = {
|
||||
# Reranker configuration (overrides settings.json)
|
||||
"RERANKER_MODEL": "Reranker model name (overrides settings.json)",
|
||||
"RERANKER_BACKEND": "Reranker backend: fastembed, onnx, api, litellm, legacy",
|
||||
"RERANKER_USE_GPU": "Use GPU for local reranker backends: true/false",
|
||||
"RERANKER_ENABLED": "Enable reranker: true/false",
|
||||
"RERANKER_API_KEY": "API key for reranker service (SiliconFlow/Cohere/Jina)",
|
||||
"RERANKER_API_BASE": "Base URL for reranker API (overrides provider default)",
|
||||
@@ -65,6 +66,9 @@ ENV_VARS = {
|
||||
# Chunking configuration
|
||||
"CHUNK_STRIP_COMMENTS": "Strip comments from code chunks for embedding: true/false (default: true)",
|
||||
"CHUNK_STRIP_DOCSTRINGS": "Strip docstrings from code chunks for embedding: true/false (default: true)",
|
||||
# Search ranking tuning
|
||||
"TEST_FILE_PENALTY": "Penalty for test/fixture paths in final search ranking: 0.0-1.0 (default: 0.15)",
|
||||
"GENERATED_FILE_PENALTY": "Penalty for generated/build artifact paths in final search ranking: 0.0-1.0 (default: 0.35)",
|
||||
# Reranker tuning
|
||||
"RERANKER_TEST_FILE_PENALTY": "Penalty for test files in reranking: 0.0-1.0 (default: 0.0)",
|
||||
"RERANKER_DOCSTRING_WEIGHT": "Weight for docstring chunks in reranking: 0.0-1.0 (default: 1.0)",
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -7,6 +7,7 @@ results via Reciprocal Rank Fusion (RRF) algorithm.
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import threading
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeoutError, as_completed
|
||||
from contextlib import contextmanager
|
||||
@@ -34,19 +35,21 @@ from codexlens.config import Config
|
||||
from codexlens.config import VECTORS_HNSW_NAME
|
||||
from codexlens.entities import SearchResult
|
||||
from codexlens.search.ranking import (
|
||||
DEFAULT_WEIGHTS,
|
||||
DEFAULT_WEIGHTS as RANKING_DEFAULT_WEIGHTS,
|
||||
QueryIntent,
|
||||
apply_symbol_boost,
|
||||
cross_encoder_rerank,
|
||||
detect_query_intent,
|
||||
filter_results_by_category,
|
||||
get_rrf_weights,
|
||||
query_prefers_lexical_search,
|
||||
reciprocal_rank_fusion,
|
||||
rerank_results,
|
||||
simple_weighted_fusion,
|
||||
tag_search_source,
|
||||
)
|
||||
from codexlens.storage.dir_index import DirIndexStore
|
||||
from codexlens.storage.index_filters import filter_index_paths
|
||||
|
||||
# Optional LSP imports (for real-time graph expansion)
|
||||
try:
|
||||
@@ -67,8 +70,13 @@ class HybridSearchEngine:
|
||||
default_weights: Default RRF weights for each source
|
||||
"""
|
||||
|
||||
# NOTE: DEFAULT_WEIGHTS imported from ranking.py - single source of truth
|
||||
# FTS + vector hybrid mode (exact: 0.3, fuzzy: 0.1, vector: 0.6)
|
||||
# Public compatibility contract for callers/tests that expect the legacy
|
||||
# three-backend defaults on the engine instance.
|
||||
DEFAULT_WEIGHTS = {
|
||||
"exact": 0.3,
|
||||
"fuzzy": 0.1,
|
||||
"vector": 0.6,
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -95,11 +103,172 @@ class HybridSearchEngine:
|
||||
f"Did you mean to pass index_path to search() instead of __init__()?"
|
||||
)
|
||||
|
||||
self.weights = weights or DEFAULT_WEIGHTS.copy()
|
||||
self.weights = weights
|
||||
self._config = config
|
||||
self.embedder = embedder
|
||||
self.reranker: Any = None
|
||||
self._use_gpu = config.embedding_use_gpu if config else True
|
||||
self._centralized_cache_lock = threading.RLock()
|
||||
self._centralized_model_config_cache: Dict[str, Any] = {}
|
||||
self._centralized_embedder_cache: Dict[tuple[Any, ...], Any] = {}
|
||||
self._centralized_ann_cache: Dict[tuple[str, int], Any] = {}
|
||||
self._centralized_query_embedding_cache: Dict[tuple[Any, ...], Any] = {}
|
||||
|
||||
@property
|
||||
def weights(self) -> Dict[str, float]:
|
||||
"""Public/default weights exposed for backwards compatibility."""
|
||||
return dict(self._weights)
|
||||
|
||||
@weights.setter
|
||||
def weights(self, value: Optional[Dict[str, float]]) -> None:
|
||||
"""Update public and internal fusion weights together."""
|
||||
if value is None:
|
||||
public_weights = self.DEFAULT_WEIGHTS.copy()
|
||||
fusion_weights = dict(RANKING_DEFAULT_WEIGHTS)
|
||||
fusion_weights.update(public_weights)
|
||||
else:
|
||||
if not isinstance(value, dict):
|
||||
raise TypeError(f"weights must be a dict, got {type(value).__name__}")
|
||||
public_weights = dict(value)
|
||||
fusion_weights = dict(value)
|
||||
|
||||
self._weights = public_weights
|
||||
self._fusion_weights = fusion_weights
|
||||
|
||||
@staticmethod
|
||||
def _clamp_search_score(score: float) -> float:
|
||||
"""Keep ANN-derived similarity scores within SearchResult's valid domain."""
|
||||
|
||||
return max(0.0, float(score))
|
||||
|
||||
def _get_centralized_model_config(self, index_root: Path) -> Optional[Dict[str, Any]]:
|
||||
"""Load and cache the centralized embedding model config for an index root."""
|
||||
root_key = str(Path(index_root).resolve())
|
||||
|
||||
with self._centralized_cache_lock:
|
||||
if root_key in self._centralized_model_config_cache:
|
||||
cached = self._centralized_model_config_cache[root_key]
|
||||
return dict(cached) if isinstance(cached, dict) else None
|
||||
|
||||
model_config: Optional[Dict[str, Any]] = None
|
||||
try:
|
||||
from codexlens.semantic.vector_store import VectorStore
|
||||
|
||||
central_index_path = Path(root_key) / "_index.db"
|
||||
if central_index_path.exists():
|
||||
with VectorStore(central_index_path) as vs:
|
||||
loaded = vs.get_model_config()
|
||||
if isinstance(loaded, dict):
|
||||
model_config = dict(loaded)
|
||||
self.logger.debug(
|
||||
"Loaded model config from centralized index: %s",
|
||||
model_config,
|
||||
)
|
||||
except Exception as exc:
|
||||
self.logger.debug(
|
||||
"Failed to load model config from centralized index: %s",
|
||||
exc,
|
||||
)
|
||||
|
||||
with self._centralized_cache_lock:
|
||||
self._centralized_model_config_cache[root_key] = (
|
||||
dict(model_config) if isinstance(model_config, dict) else None
|
||||
)
|
||||
|
||||
return dict(model_config) if isinstance(model_config, dict) else None
|
||||
|
||||
def _get_centralized_embedder(
|
||||
self,
|
||||
model_config: Optional[Dict[str, Any]],
|
||||
) -> tuple[Any, int, tuple[Any, ...]]:
|
||||
"""Resolve and cache the embedder used for centralized vector search."""
|
||||
from codexlens.semantic.factory import get_embedder
|
||||
|
||||
backend = "fastembed"
|
||||
model_name: Optional[str] = None
|
||||
model_profile = "code"
|
||||
use_gpu = bool(self._use_gpu)
|
||||
embedding_dim: Optional[int] = None
|
||||
|
||||
if model_config:
|
||||
backend = str(model_config.get("backend", "fastembed") or "fastembed")
|
||||
model_name = model_config.get("model_name")
|
||||
model_profile = str(model_config.get("model_profile", "code") or "code")
|
||||
raw_dim = model_config.get("embedding_dim")
|
||||
embedding_dim = int(raw_dim) if raw_dim else None
|
||||
|
||||
if backend == "litellm":
|
||||
embedder_key: tuple[Any, ...] = ("litellm", model_name or "", None)
|
||||
else:
|
||||
embedder_key = ("fastembed", model_profile, use_gpu)
|
||||
|
||||
with self._centralized_cache_lock:
|
||||
cached = self._centralized_embedder_cache.get(embedder_key)
|
||||
if cached is None:
|
||||
if backend == "litellm":
|
||||
cached = get_embedder(backend="litellm", model=model_name)
|
||||
else:
|
||||
cached = get_embedder(
|
||||
backend="fastembed",
|
||||
profile=model_profile,
|
||||
use_gpu=use_gpu,
|
||||
)
|
||||
with self._centralized_cache_lock:
|
||||
existing = self._centralized_embedder_cache.get(embedder_key)
|
||||
if existing is None:
|
||||
self._centralized_embedder_cache[embedder_key] = cached
|
||||
else:
|
||||
cached = existing
|
||||
|
||||
if embedding_dim is None:
|
||||
embedding_dim = int(getattr(cached, "embedding_dim", 0) or 0)
|
||||
|
||||
return cached, embedding_dim, embedder_key
|
||||
|
||||
def _get_centralized_ann_index(self, index_root: Path, dim: int) -> Any:
|
||||
"""Load and cache a centralized ANN index for repeated searches."""
|
||||
from codexlens.semantic.ann_index import ANNIndex
|
||||
|
||||
resolved_root = Path(index_root).resolve()
|
||||
cache_key = (str(resolved_root), int(dim))
|
||||
|
||||
with self._centralized_cache_lock:
|
||||
cached = self._centralized_ann_cache.get(cache_key)
|
||||
if cached is not None:
|
||||
return cached
|
||||
|
||||
ann_index = ANNIndex.create_central(index_root=resolved_root, dim=int(dim))
|
||||
if not ann_index.load():
|
||||
return None
|
||||
|
||||
with self._centralized_cache_lock:
|
||||
existing = self._centralized_ann_cache.get(cache_key)
|
||||
if existing is None:
|
||||
self._centralized_ann_cache[cache_key] = ann_index
|
||||
return ann_index
|
||||
return existing
|
||||
|
||||
def _get_cached_query_embedding(
|
||||
self,
|
||||
query: str,
|
||||
embedder: Any,
|
||||
embedder_key: tuple[Any, ...],
|
||||
) -> Any:
|
||||
"""Cache repeated query embeddings for the same embedder settings."""
|
||||
cache_key = embedder_key + (query,)
|
||||
|
||||
with self._centralized_cache_lock:
|
||||
cached = self._centralized_query_embedding_cache.get(cache_key)
|
||||
if cached is not None:
|
||||
return cached
|
||||
|
||||
query_embedding = embedder.embed_single(query)
|
||||
with self._centralized_cache_lock:
|
||||
existing = self._centralized_query_embedding_cache.get(cache_key)
|
||||
if existing is None:
|
||||
self._centralized_query_embedding_cache[cache_key] = query_embedding
|
||||
return query_embedding
|
||||
return existing
|
||||
|
||||
def search(
|
||||
self,
|
||||
@@ -154,6 +323,7 @@ class HybridSearchEngine:
|
||||
|
||||
# Detect query intent early for category filtering at index level
|
||||
query_intent = detect_query_intent(query)
|
||||
lexical_priority_query = query_prefers_lexical_search(query)
|
||||
# Map intent to category for vector search:
|
||||
# - KEYWORD (code intent) -> filter to 'code' only
|
||||
# - SEMANTIC (doc intent) -> no filter (allow docs to surface)
|
||||
@@ -182,11 +352,11 @@ class HybridSearchEngine:
|
||||
backends["exact"] = True
|
||||
if enable_fuzzy:
|
||||
backends["fuzzy"] = True
|
||||
if enable_vector:
|
||||
if enable_vector and not lexical_priority_query:
|
||||
backends["vector"] = True
|
||||
|
||||
# Add LSP graph expansion if requested and available
|
||||
if enable_lsp_graph and HAS_LSP:
|
||||
if enable_lsp_graph and HAS_LSP and not lexical_priority_query:
|
||||
backends["lsp_graph"] = True
|
||||
elif enable_lsp_graph and not HAS_LSP:
|
||||
self.logger.warning(
|
||||
@@ -214,7 +384,7 @@ class HybridSearchEngine:
|
||||
# Filter weights to only active backends
|
||||
active_weights = {
|
||||
source: weight
|
||||
for source, weight in self.weights.items()
|
||||
for source, weight in self._fusion_weights.items()
|
||||
if source in results_map
|
||||
}
|
||||
|
||||
@@ -247,10 +417,16 @@ class HybridSearchEngine:
|
||||
)
|
||||
|
||||
# Optional: embedding-based reranking on top results
|
||||
if self._config is not None and self._config.enable_reranking:
|
||||
if (
|
||||
self._config is not None
|
||||
and self._config.enable_reranking
|
||||
and not lexical_priority_query
|
||||
):
|
||||
with timer("reranking", self.logger):
|
||||
if self.embedder is None:
|
||||
self.embedder = self._get_reranking_embedder()
|
||||
with self._centralized_cache_lock:
|
||||
if self.embedder is None:
|
||||
self.embedder = self._get_reranking_embedder()
|
||||
fused_results = rerank_results(
|
||||
query,
|
||||
fused_results[:100],
|
||||
@@ -267,10 +443,13 @@ class HybridSearchEngine:
|
||||
self._config is not None
|
||||
and self._config.enable_reranking
|
||||
and self._config.enable_cross_encoder_rerank
|
||||
and not lexical_priority_query
|
||||
):
|
||||
with timer("cross_encoder_rerank", self.logger):
|
||||
if self.reranker is None:
|
||||
self.reranker = self._get_cross_encoder_reranker()
|
||||
with self._centralized_cache_lock:
|
||||
if self.reranker is None:
|
||||
self.reranker = self._get_cross_encoder_reranker()
|
||||
if self.reranker is not None:
|
||||
fused_results = cross_encoder_rerank(
|
||||
query,
|
||||
@@ -363,11 +542,18 @@ class HybridSearchEngine:
|
||||
|
||||
device: str | None = None
|
||||
kwargs: dict[str, Any] = {}
|
||||
reranker_use_gpu = bool(
|
||||
getattr(
|
||||
self._config,
|
||||
"reranker_use_gpu",
|
||||
getattr(self._config, "embedding_use_gpu", True),
|
||||
)
|
||||
)
|
||||
|
||||
if backend == "onnx":
|
||||
kwargs["use_gpu"] = bool(getattr(self._config, "embedding_use_gpu", True))
|
||||
kwargs["use_gpu"] = reranker_use_gpu
|
||||
elif backend == "legacy":
|
||||
if not bool(getattr(self._config, "embedding_use_gpu", True)):
|
||||
if not reranker_use_gpu:
|
||||
device = "cpu"
|
||||
elif backend == "api":
|
||||
# Pass max_input_tokens for adaptive batching
|
||||
@@ -573,60 +759,16 @@ class HybridSearchEngine:
|
||||
List of SearchResult objects ordered by semantic similarity
|
||||
"""
|
||||
try:
|
||||
import sqlite3
|
||||
import json
|
||||
from codexlens.semantic.factory import get_embedder
|
||||
from codexlens.semantic.ann_index import ANNIndex
|
||||
|
||||
# Get model config from the first index database we can find
|
||||
# (all indexes should use the same embedding model)
|
||||
index_root = hnsw_path.parent
|
||||
model_config = None
|
||||
|
||||
# Try to get model config from the centralized index root first
|
||||
# (not the sub-directory index_path, which may have outdated config)
|
||||
try:
|
||||
from codexlens.semantic.vector_store import VectorStore
|
||||
central_index_path = index_root / "_index.db"
|
||||
if central_index_path.exists():
|
||||
with VectorStore(central_index_path) as vs:
|
||||
model_config = vs.get_model_config()
|
||||
self.logger.debug(
|
||||
"Loaded model config from centralized index: %s",
|
||||
model_config
|
||||
)
|
||||
except Exception as e:
|
||||
self.logger.debug("Failed to load model config from centralized index: %s", e)
|
||||
|
||||
# Detect dimension from HNSW file if model config not found
|
||||
model_config = self._get_centralized_model_config(index_root)
|
||||
if model_config is None:
|
||||
self.logger.debug("Model config not found, will detect from HNSW index")
|
||||
# Create a temporary ANNIndex to load and detect dimension
|
||||
# We need to know the dimension to properly load the index
|
||||
|
||||
# Get embedder based on model config or default
|
||||
if model_config:
|
||||
backend = model_config.get("backend", "fastembed")
|
||||
model_name = model_config["model_name"]
|
||||
model_profile = model_config["model_profile"]
|
||||
embedding_dim = model_config["embedding_dim"]
|
||||
|
||||
if backend == "litellm":
|
||||
embedder = get_embedder(backend="litellm", model=model_name)
|
||||
else:
|
||||
embedder = get_embedder(backend="fastembed", profile=model_profile)
|
||||
else:
|
||||
# Default to code profile
|
||||
embedder = get_embedder(backend="fastembed", profile="code")
|
||||
embedding_dim = embedder.embedding_dim
|
||||
self.logger.debug("Model config not found, will detect from cached embedder")
|
||||
embedder, embedding_dim, embedder_key = self._get_centralized_embedder(model_config)
|
||||
|
||||
# Load centralized ANN index
|
||||
start_load = time.perf_counter()
|
||||
ann_index = ANNIndex.create_central(
|
||||
index_root=index_root,
|
||||
dim=embedding_dim,
|
||||
)
|
||||
if not ann_index.load():
|
||||
ann_index = self._get_centralized_ann_index(index_root=index_root, dim=embedding_dim)
|
||||
if ann_index is None:
|
||||
self.logger.warning("Failed to load centralized vector index from %s", hnsw_path)
|
||||
return []
|
||||
self.logger.debug(
|
||||
@@ -637,7 +779,7 @@ class HybridSearchEngine:
|
||||
|
||||
# Generate query embedding
|
||||
start_embed = time.perf_counter()
|
||||
query_embedding = embedder.embed_single(query)
|
||||
query_embedding = self._get_cached_query_embedding(query, embedder, embedder_key)
|
||||
self.logger.debug(
|
||||
"[TIMING] query_embedding: %.2fms",
|
||||
(time.perf_counter() - start_embed) * 1000
|
||||
@@ -658,7 +800,7 @@ class HybridSearchEngine:
|
||||
return []
|
||||
|
||||
# Convert distances to similarity scores (for cosine: score = 1 - distance)
|
||||
scores = [1.0 - d for d in distances]
|
||||
scores = [self._clamp_search_score(1.0 - d) for d in distances]
|
||||
|
||||
# Fetch chunk metadata from semantic_chunks tables
|
||||
# We need to search across all _index.db files in the project
|
||||
@@ -755,7 +897,7 @@ class HybridSearchEngine:
|
||||
start_line = row.get("start_line")
|
||||
end_line = row.get("end_line")
|
||||
|
||||
score = score_map.get(chunk_id, 0.0)
|
||||
score = self._clamp_search_score(score_map.get(chunk_id, 0.0))
|
||||
|
||||
# Build excerpt
|
||||
excerpt = content[:200] + "..." if len(content) > 200 else content
|
||||
@@ -818,7 +960,7 @@ class HybridSearchEngine:
|
||||
import json
|
||||
|
||||
# Find all _index.db files
|
||||
index_files = list(index_root.rglob("_index.db"))
|
||||
index_files = filter_index_paths(index_root.rglob("_index.db"), index_root)
|
||||
|
||||
results = []
|
||||
found_ids = set()
|
||||
@@ -870,7 +1012,7 @@ class HybridSearchEngine:
|
||||
metadata_json = row["metadata"]
|
||||
metadata = json.loads(metadata_json) if metadata_json else {}
|
||||
|
||||
score = score_map.get(chunk_id, 0.0)
|
||||
score = self._clamp_search_score(score_map.get(chunk_id, 0.0))
|
||||
|
||||
# Build excerpt
|
||||
excerpt = content[:200] + "..." if len(content) > 200 else content
|
||||
|
||||
@@ -6,6 +6,7 @@ for combining results from heterogeneous search backends (exact FTS, fuzzy FTS,
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
import math
|
||||
from enum import Enum
|
||||
@@ -14,6 +15,8 @@ from typing import Any, Dict, List, Optional
|
||||
|
||||
from codexlens.entities import SearchResult, AdditionalLocation
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Default RRF weights for hybrid search
|
||||
DEFAULT_WEIGHTS = {
|
||||
@@ -32,6 +35,229 @@ class QueryIntent(str, Enum):
|
||||
MIXED = "mixed"
|
||||
|
||||
|
||||
_TEST_QUERY_RE = re.compile(
|
||||
r"\b(test|tests|spec|specs|fixture|fixtures|benchmark|benchmarks)\b",
|
||||
flags=re.IGNORECASE,
|
||||
)
|
||||
_AUXILIARY_QUERY_RE = re.compile(
|
||||
r"\b(example|examples|demo|demos|sample|samples|debug|benchmark|benchmarks|profile|profiling)\b",
|
||||
flags=re.IGNORECASE,
|
||||
)
|
||||
_ARTIFACT_QUERY_RE = re.compile(
|
||||
r"(?<!\w)(dist|build|out|coverage|htmlcov|generated|bundle|compiled|artifact|artifacts|\.workflow)(?!\w)",
|
||||
flags=re.IGNORECASE,
|
||||
)
|
||||
_ENV_STYLE_QUERY_RE = re.compile(r"\b[A-Z][A-Z0-9]+(?:_[A-Z0-9]+)+\b")
|
||||
_AUXILIARY_DIR_NAMES = frozenset(
|
||||
{
|
||||
"example",
|
||||
"examples",
|
||||
"demo",
|
||||
"demos",
|
||||
"sample",
|
||||
"samples",
|
||||
"benchmark",
|
||||
"benchmarks",
|
||||
"profile",
|
||||
"profiles",
|
||||
}
|
||||
)
|
||||
_GENERATED_DIR_NAMES = frozenset(
|
||||
{
|
||||
"dist",
|
||||
"build",
|
||||
"out",
|
||||
"coverage",
|
||||
"htmlcov",
|
||||
".cache",
|
||||
".workflow",
|
||||
".next",
|
||||
".nuxt",
|
||||
".parcel-cache",
|
||||
".turbo",
|
||||
"tmp",
|
||||
"temp",
|
||||
"generated",
|
||||
}
|
||||
)
|
||||
_GENERATED_FILE_SUFFIXES = (
|
||||
".generated.ts",
|
||||
".generated.tsx",
|
||||
".generated.js",
|
||||
".generated.jsx",
|
||||
".generated.py",
|
||||
".gen.ts",
|
||||
".gen.tsx",
|
||||
".gen.js",
|
||||
".gen.jsx",
|
||||
".min.js",
|
||||
".min.css",
|
||||
".bundle.js",
|
||||
".bundle.css",
|
||||
)
|
||||
_SOURCE_DIR_NAMES = frozenset(
|
||||
{
|
||||
"src",
|
||||
"lib",
|
||||
"core",
|
||||
"app",
|
||||
"server",
|
||||
"client",
|
||||
"services",
|
||||
}
|
||||
)
|
||||
_IDENTIFIER_QUERY_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$")
|
||||
_TOPIC_TOKEN_RE = re.compile(r"[A-Za-z][A-Za-z0-9]*")
|
||||
_EXPLICIT_PATH_HINT_MARKER_RE = re.compile(r"[_\-/\\.]")
|
||||
_SEMANTIC_QUERY_STOPWORDS = frozenset(
|
||||
{
|
||||
"the",
|
||||
"a",
|
||||
"an",
|
||||
"is",
|
||||
"are",
|
||||
"was",
|
||||
"were",
|
||||
"be",
|
||||
"been",
|
||||
"being",
|
||||
"have",
|
||||
"has",
|
||||
"had",
|
||||
"do",
|
||||
"does",
|
||||
"did",
|
||||
"will",
|
||||
"would",
|
||||
"could",
|
||||
"should",
|
||||
"may",
|
||||
"might",
|
||||
"must",
|
||||
"can",
|
||||
"to",
|
||||
"of",
|
||||
"in",
|
||||
"for",
|
||||
"on",
|
||||
"with",
|
||||
"at",
|
||||
"by",
|
||||
"from",
|
||||
"as",
|
||||
"into",
|
||||
"through",
|
||||
"and",
|
||||
"but",
|
||||
"if",
|
||||
"or",
|
||||
"not",
|
||||
"this",
|
||||
"that",
|
||||
"these",
|
||||
"those",
|
||||
"it",
|
||||
"its",
|
||||
"how",
|
||||
"what",
|
||||
"where",
|
||||
"when",
|
||||
"why",
|
||||
"which",
|
||||
"who",
|
||||
"whom",
|
||||
}
|
||||
)
|
||||
_PATH_TOPIC_STOPWORDS = frozenset(
|
||||
{
|
||||
*_SOURCE_DIR_NAMES,
|
||||
*_AUXILIARY_DIR_NAMES,
|
||||
*_GENERATED_DIR_NAMES,
|
||||
"tool",
|
||||
"tools",
|
||||
"util",
|
||||
"utils",
|
||||
"test",
|
||||
"tests",
|
||||
"spec",
|
||||
"specs",
|
||||
"fixture",
|
||||
"fixtures",
|
||||
"index",
|
||||
"main",
|
||||
"ts",
|
||||
"tsx",
|
||||
"js",
|
||||
"jsx",
|
||||
"mjs",
|
||||
"cjs",
|
||||
"py",
|
||||
"java",
|
||||
"go",
|
||||
"rs",
|
||||
"rb",
|
||||
"php",
|
||||
"cs",
|
||||
"cpp",
|
||||
"cc",
|
||||
"c",
|
||||
"h",
|
||||
}
|
||||
)
|
||||
_LEXICAL_PRIORITY_SURFACE_TOKENS = frozenset(
|
||||
{
|
||||
"config",
|
||||
"configs",
|
||||
"configuration",
|
||||
"configurations",
|
||||
"setting",
|
||||
"settings",
|
||||
"backend",
|
||||
"backends",
|
||||
"environment",
|
||||
"env",
|
||||
"variable",
|
||||
"variables",
|
||||
"factory",
|
||||
"factories",
|
||||
"override",
|
||||
"overrides",
|
||||
"option",
|
||||
"options",
|
||||
"flag",
|
||||
"flags",
|
||||
"mode",
|
||||
"modes",
|
||||
}
|
||||
)
|
||||
_LEXICAL_PRIORITY_FOCUS_TOKENS = frozenset(
|
||||
{
|
||||
"embedding",
|
||||
"embeddings",
|
||||
"reranker",
|
||||
"rerankers",
|
||||
"onnx",
|
||||
"api",
|
||||
"litellm",
|
||||
"fastembed",
|
||||
"local",
|
||||
"legacy",
|
||||
"stage",
|
||||
"stage2",
|
||||
"stage3",
|
||||
"stage4",
|
||||
"precomputed",
|
||||
"realtime",
|
||||
"static",
|
||||
"global",
|
||||
"graph",
|
||||
"selection",
|
||||
"model",
|
||||
"models",
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
def normalize_weights(weights: Dict[str, float | None]) -> Dict[str, float | None]:
|
||||
"""Normalize weights to sum to 1.0 (best-effort)."""
|
||||
total = sum(float(v) for v in weights.values() if v is not None)
|
||||
@@ -66,6 +292,7 @@ def detect_query_intent(query: str) -> QueryIntent:
|
||||
has_code_signals = bool(
|
||||
re.search(r"(::|->|\.)", trimmed)
|
||||
or re.search(r"[A-Z][a-z]+[A-Z]", trimmed)
|
||||
or re.search(r"\b[a-z]+[A-Z][A-Za-z0-9_]*\b", trimmed)
|
||||
or re.search(r"\b\w+_\w+\b", trimmed)
|
||||
or re.search(
|
||||
r"\b(def|class|function|const|let|var|import|from|return|async|await|interface|type)\b",
|
||||
@@ -119,6 +346,56 @@ def get_rrf_weights(
|
||||
return adjust_weights_by_intent(detect_query_intent(query), base_weights)
|
||||
|
||||
|
||||
def query_targets_test_files(query: str) -> bool:
|
||||
"""Return True when the query explicitly targets tests/spec fixtures."""
|
||||
return bool(_TEST_QUERY_RE.search((query or "").strip()))
|
||||
|
||||
|
||||
def query_targets_generated_files(query: str) -> bool:
|
||||
"""Return True when the query explicitly targets generated/build artifacts."""
|
||||
return bool(_ARTIFACT_QUERY_RE.search((query or "").strip()))
|
||||
|
||||
|
||||
def query_targets_auxiliary_files(query: str) -> bool:
|
||||
"""Return True when the query explicitly targets examples, benchmarks, or debug files."""
|
||||
return bool(_AUXILIARY_QUERY_RE.search((query or "").strip()))
|
||||
|
||||
|
||||
def query_prefers_lexical_search(query: str) -> bool:
|
||||
"""Return True when config/env/factory style queries are safer with lexical-first search."""
|
||||
trimmed = (query or "").strip()
|
||||
if not trimmed:
|
||||
return False
|
||||
|
||||
if _ENV_STYLE_QUERY_RE.search(trimmed):
|
||||
return True
|
||||
|
||||
query_tokens = set(_semantic_query_topic_tokens(trimmed))
|
||||
if not query_tokens:
|
||||
return False
|
||||
|
||||
if query_tokens.intersection({"factory", "factories"}):
|
||||
return True
|
||||
|
||||
if query_tokens.intersection({"environment", "env"}) and query_tokens.intersection({"variable", "variables"}):
|
||||
return True
|
||||
|
||||
if "backend" in query_tokens and query_tokens.intersection(
|
||||
{"embedding", "embeddings", "reranker", "rerankers", "onnx", "api", "litellm", "fastembed", "local", "legacy"}
|
||||
):
|
||||
return True
|
||||
|
||||
surface_hits = query_tokens.intersection(_LEXICAL_PRIORITY_SURFACE_TOKENS)
|
||||
focus_hits = query_tokens.intersection(_LEXICAL_PRIORITY_FOCUS_TOKENS)
|
||||
return bool(surface_hits and focus_hits)
|
||||
|
||||
|
||||
def _normalized_path_parts(path: str) -> List[str]:
|
||||
"""Normalize a path string into casefolded components for heuristics."""
|
||||
normalized = (path or "").replace("\\", "/")
|
||||
return [part.casefold() for part in normalized.split("/") if part and part != "."]
|
||||
|
||||
|
||||
# File extensions to category mapping for fast lookup
|
||||
_EXT_TO_CATEGORY: Dict[str, str] = {
|
||||
# Code extensions
|
||||
@@ -196,6 +473,482 @@ def filter_results_by_category(
|
||||
return filtered
|
||||
|
||||
|
||||
def is_test_file(path: str) -> bool:
|
||||
"""Return True when a path clearly refers to a test/spec file."""
|
||||
parts = _normalized_path_parts(path)
|
||||
if not parts:
|
||||
return False
|
||||
basename = parts[-1]
|
||||
return (
|
||||
basename.startswith("test_")
|
||||
or basename.endswith("_test.py")
|
||||
or basename.endswith(".test.ts")
|
||||
or basename.endswith(".test.tsx")
|
||||
or basename.endswith(".test.js")
|
||||
or basename.endswith(".test.jsx")
|
||||
or basename.endswith(".spec.ts")
|
||||
or basename.endswith(".spec.tsx")
|
||||
or basename.endswith(".spec.js")
|
||||
or basename.endswith(".spec.jsx")
|
||||
or "tests" in parts[:-1]
|
||||
or "test" in parts[:-1]
|
||||
or "__fixtures__" in parts[:-1]
|
||||
or "fixtures" in parts[:-1]
|
||||
)
|
||||
|
||||
|
||||
def is_generated_artifact_path(path: str) -> bool:
|
||||
"""Return True when a path clearly points at generated/build artifacts."""
|
||||
parts = _normalized_path_parts(path)
|
||||
if not parts:
|
||||
return False
|
||||
basename = parts[-1]
|
||||
return any(part in _GENERATED_DIR_NAMES for part in parts[:-1]) or basename.endswith(
|
||||
_GENERATED_FILE_SUFFIXES
|
||||
)
|
||||
|
||||
|
||||
def is_auxiliary_reference_path(path: str) -> bool:
|
||||
"""Return True for examples, benchmarks, demos, and debug helper files."""
|
||||
parts = _normalized_path_parts(path)
|
||||
if not parts:
|
||||
return False
|
||||
basename = parts[-1]
|
||||
if any(part in _AUXILIARY_DIR_NAMES for part in parts[:-1]):
|
||||
return True
|
||||
return (
|
||||
basename.startswith("debug_")
|
||||
or basename.startswith("benchmark")
|
||||
or basename.startswith("profile_")
|
||||
or "_benchmark" in basename
|
||||
or "_profile" in basename
|
||||
)
|
||||
|
||||
|
||||
def _extract_identifier_query(query: str) -> Optional[str]:
|
||||
"""Return a single-token identifier query when definition boosting is safe."""
|
||||
trimmed = (query or "").strip()
|
||||
if not trimmed or " " in trimmed:
|
||||
return None
|
||||
if not _IDENTIFIER_QUERY_RE.fullmatch(trimmed):
|
||||
return None
|
||||
return trimmed
|
||||
|
||||
|
||||
def extract_explicit_path_hints(query: str) -> List[List[str]]:
|
||||
"""Extract explicit path/file hints from separator-style query tokens.
|
||||
|
||||
Natural-language queries often contain one or two high-signal feature/file
|
||||
hints such as ``smart_search`` or ``smart-search.ts`` alongside broader
|
||||
platform words like ``CodexLens``. These hints should be treated as more
|
||||
specific than the surrounding prose.
|
||||
"""
|
||||
hints: List[List[str]] = []
|
||||
seen: set[tuple[str, ...]] = set()
|
||||
for raw_part in re.split(r"\s+", query or ""):
|
||||
candidate = raw_part.strip().strip("\"'`()[]{}<>:,;")
|
||||
if not candidate or not _EXPLICIT_PATH_HINT_MARKER_RE.search(candidate):
|
||||
continue
|
||||
tokens = [
|
||||
token
|
||||
for token in _split_identifier_like_tokens(candidate)
|
||||
if token not in _PATH_TOPIC_STOPWORDS
|
||||
]
|
||||
if len(tokens) < 2:
|
||||
continue
|
||||
key = tuple(tokens)
|
||||
if key in seen:
|
||||
continue
|
||||
seen.add(key)
|
||||
hints.append(list(key))
|
||||
return hints
|
||||
|
||||
|
||||
def _is_source_implementation_path(path: str) -> bool:
|
||||
"""Return True when a path looks like an implementation file under a source dir."""
|
||||
parts = _normalized_path_parts(path)
|
||||
if not parts:
|
||||
return False
|
||||
return any(part in _SOURCE_DIR_NAMES for part in parts[:-1])
|
||||
|
||||
|
||||
def _result_text_candidates(result: SearchResult) -> List[str]:
|
||||
"""Collect short text snippets that may contain a symbol definition."""
|
||||
candidates: List[str] = []
|
||||
for text in (result.excerpt, result.content):
|
||||
if not isinstance(text, str) or not text.strip():
|
||||
continue
|
||||
for line in text.splitlines():
|
||||
stripped = line.strip()
|
||||
if stripped:
|
||||
candidates.append(stripped)
|
||||
if len(candidates) >= 6:
|
||||
break
|
||||
if len(candidates) >= 6:
|
||||
break
|
||||
|
||||
symbol_name = result.symbol_name
|
||||
if not symbol_name and result.symbol is not None:
|
||||
symbol_name = getattr(result.symbol, "name", None)
|
||||
if isinstance(symbol_name, str) and symbol_name.strip():
|
||||
candidates.append(symbol_name.strip())
|
||||
return candidates
|
||||
|
||||
|
||||
def _result_defines_identifier(result: SearchResult, symbol: str) -> bool:
|
||||
"""Best-effort check for whether a result snippet looks like a symbol definition."""
|
||||
escaped_symbol = re.escape(symbol)
|
||||
definition_patterns = (
|
||||
rf"^\s*(?:export\s+)?(?:default\s+)?(?:async\s+)?def\s+{escaped_symbol}\b",
|
||||
rf"^\s*(?:export\s+)?(?:default\s+)?(?:async\s+)?function\s+{escaped_symbol}\b",
|
||||
rf"^\s*(?:export\s+)?(?:default\s+)?class\s+{escaped_symbol}\b",
|
||||
rf"^\s*(?:export\s+)?(?:default\s+)?interface\s+{escaped_symbol}\b",
|
||||
rf"^\s*(?:export\s+)?(?:default\s+)?type\s+{escaped_symbol}\b",
|
||||
rf"^\s*(?:export\s+)?(?:default\s+)?(?:const|let|var)\s+{escaped_symbol}\b",
|
||||
rf"^\s*{escaped_symbol}\s*=\s*(?:async\s+)?\(",
|
||||
rf"^\s*{escaped_symbol}\s*=\s*(?:async\s+)?[^=]*=>",
|
||||
)
|
||||
for candidate in _result_text_candidates(result):
|
||||
if any(re.search(pattern, candidate) for pattern in definition_patterns):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def _split_identifier_like_tokens(text: str) -> List[str]:
|
||||
"""Split identifier-like text into normalized word tokens."""
|
||||
if not text:
|
||||
return []
|
||||
|
||||
tokens: List[str] = []
|
||||
for raw_token in _TOPIC_TOKEN_RE.findall(text):
|
||||
expanded = re.sub(r"([a-z0-9])([A-Z])", r"\1 \2", raw_token)
|
||||
expanded = re.sub(r"([A-Z]+)([A-Z][a-z])", r"\1 \2", expanded)
|
||||
for token in expanded.split():
|
||||
normalized = _normalize_topic_token(token)
|
||||
if normalized:
|
||||
tokens.append(normalized)
|
||||
return tokens
|
||||
|
||||
|
||||
def _normalize_topic_token(token: str) -> Optional[str]:
|
||||
"""Normalize lightweight topic tokens for query/path overlap heuristics."""
|
||||
normalized = (token or "").casefold()
|
||||
if len(normalized) < 2 or normalized.isdigit():
|
||||
return None
|
||||
if len(normalized) > 4 and normalized.endswith("ies"):
|
||||
normalized = f"{normalized[:-3]}y"
|
||||
elif len(normalized) > 3 and normalized.endswith("s") and not normalized.endswith("ss"):
|
||||
normalized = normalized[:-1]
|
||||
return normalized or None
|
||||
|
||||
|
||||
def _dedupe_preserve_order(tokens: List[str]) -> List[str]:
|
||||
"""Deduplicate tokens while preserving the first-seen order."""
|
||||
deduped: List[str] = []
|
||||
seen: set[str] = set()
|
||||
for token in tokens:
|
||||
if token in seen:
|
||||
continue
|
||||
seen.add(token)
|
||||
deduped.append(token)
|
||||
return deduped
|
||||
|
||||
|
||||
def _semantic_query_topic_tokens(query: str) -> List[str]:
|
||||
"""Extract salient natural-language tokens for lightweight topic matching."""
|
||||
tokens = [
|
||||
token
|
||||
for token in _split_identifier_like_tokens(query)
|
||||
if token not in _SEMANTIC_QUERY_STOPWORDS
|
||||
]
|
||||
return _dedupe_preserve_order(tokens)
|
||||
|
||||
|
||||
def _path_topic_tokens(path: str) -> tuple[List[str], List[str]]:
|
||||
"""Extract normalized topic tokens from a path and its basename."""
|
||||
parts = _normalized_path_parts(path)
|
||||
if not parts:
|
||||
return [], []
|
||||
|
||||
path_tokens: List[str] = []
|
||||
basename_tokens: List[str] = []
|
||||
last_index = len(parts) - 1
|
||||
for index, part in enumerate(parts):
|
||||
target = basename_tokens if index == last_index else path_tokens
|
||||
for token in _split_identifier_like_tokens(part):
|
||||
if token in _PATH_TOPIC_STOPWORDS:
|
||||
continue
|
||||
target.append(token)
|
||||
return _dedupe_preserve_order(path_tokens), _dedupe_preserve_order(basename_tokens)
|
||||
|
||||
|
||||
def _source_path_topic_boost(
|
||||
query: str,
|
||||
path: str,
|
||||
query_intent: QueryIntent,
|
||||
) -> tuple[float, List[str]]:
|
||||
"""Return a path/topic boost when a query strongly overlaps a source path."""
|
||||
query_tokens = _semantic_query_topic_tokens(query)
|
||||
if len(query_tokens) < 2:
|
||||
return 1.0, []
|
||||
|
||||
path_tokens, basename_tokens = _path_topic_tokens(path)
|
||||
if not path_tokens and not basename_tokens:
|
||||
return 1.0, []
|
||||
|
||||
path_token_set = set(path_tokens) | set(basename_tokens)
|
||||
basename_overlap = [token for token in query_tokens if token in basename_tokens]
|
||||
all_overlap = [token for token in query_tokens if token in path_token_set]
|
||||
explicit_hint_tokens = extract_explicit_path_hints(query)
|
||||
|
||||
for hint_tokens in explicit_hint_tokens:
|
||||
if basename_tokens == hint_tokens:
|
||||
if query_intent == QueryIntent.KEYWORD:
|
||||
return 4.5, hint_tokens[:3]
|
||||
return 2.4, hint_tokens[:3]
|
||||
if all(token in basename_tokens for token in hint_tokens):
|
||||
if query_intent == QueryIntent.KEYWORD:
|
||||
return 4.5, hint_tokens[:3]
|
||||
return 1.6, hint_tokens[:3]
|
||||
|
||||
if query_prefers_lexical_search(query):
|
||||
lexical_surface_overlap = [
|
||||
token for token in basename_tokens if token in query_tokens and token in _LEXICAL_PRIORITY_SURFACE_TOKENS
|
||||
]
|
||||
if lexical_surface_overlap:
|
||||
lexical_overlap = lexical_surface_overlap[:3]
|
||||
if query_intent == QueryIntent.KEYWORD:
|
||||
return 5.5, lexical_overlap
|
||||
return 5.0, lexical_overlap
|
||||
|
||||
if query_intent == QueryIntent.KEYWORD:
|
||||
if len(basename_overlap) >= 2:
|
||||
# Multi-token identifier-style queries often name the feature/file directly.
|
||||
# Give basename matches a stronger lift so they can survive workspace fan-out.
|
||||
multiplier = min(4.5, 2.0 + 1.25 * float(len(basename_overlap)))
|
||||
return multiplier, basename_overlap[:3]
|
||||
if len(all_overlap) >= 3:
|
||||
multiplier = min(2.0, 1.1 + 0.2 * len(all_overlap))
|
||||
return multiplier, all_overlap[:3]
|
||||
return 1.0, []
|
||||
|
||||
if len(basename_overlap) >= 2:
|
||||
multiplier = min(1.45, 1.15 + 0.1 * len(basename_overlap))
|
||||
return multiplier, basename_overlap[:3]
|
||||
if len(all_overlap) >= 3:
|
||||
multiplier = min(1.3, 1.05 + 0.05 * len(all_overlap))
|
||||
return multiplier, all_overlap[:3]
|
||||
return 1.0, []
|
||||
|
||||
|
||||
def apply_path_penalties(
|
||||
results: List[SearchResult],
|
||||
query: str,
|
||||
*,
|
||||
test_file_penalty: float = 0.15,
|
||||
generated_file_penalty: float = 0.35,
|
||||
) -> List[SearchResult]:
|
||||
"""Apply lightweight path-based penalties to reduce noisy rankings."""
|
||||
if not results or (test_file_penalty <= 0 and generated_file_penalty <= 0):
|
||||
return results
|
||||
|
||||
query_intent = detect_query_intent(query)
|
||||
skip_test_penalty = query_targets_test_files(query)
|
||||
skip_auxiliary_penalty = query_targets_auxiliary_files(query)
|
||||
skip_generated_penalty = query_targets_generated_files(query)
|
||||
query_topic_tokens = _semantic_query_topic_tokens(query)
|
||||
keyword_path_query = query_intent == QueryIntent.KEYWORD and len(query_topic_tokens) >= 2
|
||||
explicit_feature_query = bool(extract_explicit_path_hints(query))
|
||||
source_oriented_query = (
|
||||
explicit_feature_query
|
||||
or keyword_path_query
|
||||
or (
|
||||
query_intent in {QueryIntent.SEMANTIC, QueryIntent.MIXED}
|
||||
and len(query_topic_tokens) >= 2
|
||||
)
|
||||
)
|
||||
identifier_query = None
|
||||
if query_intent == QueryIntent.KEYWORD:
|
||||
identifier_query = _extract_identifier_query(query)
|
||||
effective_test_penalty = float(test_file_penalty)
|
||||
if effective_test_penalty > 0 and not skip_test_penalty:
|
||||
if query_intent == QueryIntent.KEYWORD:
|
||||
# Identifier-style queries should prefer implementation files over test references.
|
||||
effective_test_penalty = max(effective_test_penalty, 0.35)
|
||||
elif query_intent in {QueryIntent.SEMANTIC, QueryIntent.MIXED}:
|
||||
# Natural-language code queries should still prefer implementation files over references.
|
||||
effective_test_penalty = max(effective_test_penalty, 0.25)
|
||||
if explicit_feature_query:
|
||||
# Explicit feature/file hints should be even more biased toward source implementations.
|
||||
effective_test_penalty = max(effective_test_penalty, 0.45)
|
||||
effective_auxiliary_penalty = effective_test_penalty
|
||||
if effective_auxiliary_penalty > 0 and not skip_auxiliary_penalty and explicit_feature_query:
|
||||
# Examples/benchmarks are usually descriptive noise for feature-targeted implementation queries.
|
||||
effective_auxiliary_penalty = max(effective_auxiliary_penalty, 0.5)
|
||||
effective_generated_penalty = float(generated_file_penalty)
|
||||
if effective_generated_penalty > 0 and not skip_generated_penalty:
|
||||
if source_oriented_query:
|
||||
effective_generated_penalty = max(effective_generated_penalty, 0.45)
|
||||
if explicit_feature_query:
|
||||
effective_generated_penalty = max(effective_generated_penalty, 0.6)
|
||||
|
||||
penalized: List[SearchResult] = []
|
||||
for result in results:
|
||||
multiplier = 1.0
|
||||
penalty_multiplier = 1.0
|
||||
boost_multiplier = 1.0
|
||||
penalty_reasons: List[str] = []
|
||||
boost_reasons: List[str] = []
|
||||
|
||||
if effective_test_penalty > 0 and not skip_test_penalty and is_test_file(result.path):
|
||||
penalty_multiplier *= max(0.0, 1.0 - effective_test_penalty)
|
||||
penalty_reasons.append("test_file")
|
||||
|
||||
if (
|
||||
effective_auxiliary_penalty > 0
|
||||
and not skip_auxiliary_penalty
|
||||
and not is_test_file(result.path)
|
||||
and is_auxiliary_reference_path(result.path)
|
||||
):
|
||||
penalty_multiplier *= max(0.0, 1.0 - effective_auxiliary_penalty)
|
||||
penalty_reasons.append("auxiliary_file")
|
||||
|
||||
if (
|
||||
effective_generated_penalty > 0
|
||||
and not skip_generated_penalty
|
||||
and is_generated_artifact_path(result.path)
|
||||
):
|
||||
penalty_multiplier *= max(0.0, 1.0 - effective_generated_penalty)
|
||||
penalty_reasons.append("generated_artifact")
|
||||
|
||||
if (
|
||||
identifier_query
|
||||
and not is_test_file(result.path)
|
||||
and not is_generated_artifact_path(result.path)
|
||||
and _result_defines_identifier(result, identifier_query)
|
||||
):
|
||||
if _is_source_implementation_path(result.path):
|
||||
boost_multiplier *= 2.0
|
||||
boost_reasons.append("source_definition")
|
||||
else:
|
||||
boost_multiplier *= 1.35
|
||||
boost_reasons.append("symbol_definition")
|
||||
|
||||
if (
|
||||
(query_intent in {QueryIntent.SEMANTIC, QueryIntent.MIXED} or keyword_path_query)
|
||||
and not skip_test_penalty
|
||||
and not skip_auxiliary_penalty
|
||||
and not skip_generated_penalty
|
||||
and not is_test_file(result.path)
|
||||
and not is_generated_artifact_path(result.path)
|
||||
and not is_auxiliary_reference_path(result.path)
|
||||
and _is_source_implementation_path(result.path)
|
||||
):
|
||||
semantic_path_boost, overlap_tokens = _source_path_topic_boost(
|
||||
query,
|
||||
result.path,
|
||||
query_intent,
|
||||
)
|
||||
if semantic_path_boost > 1.0:
|
||||
boost_multiplier *= semantic_path_boost
|
||||
boost_reasons.append("source_path_topic_overlap")
|
||||
|
||||
multiplier = penalty_multiplier * boost_multiplier
|
||||
if penalty_reasons or boost_reasons:
|
||||
metadata = {
|
||||
**result.metadata,
|
||||
"path_rank_multiplier": multiplier,
|
||||
}
|
||||
if penalty_reasons:
|
||||
metadata["path_penalty_reasons"] = penalty_reasons
|
||||
metadata["path_penalty_multiplier"] = penalty_multiplier
|
||||
if boost_reasons:
|
||||
metadata["path_boost_reasons"] = boost_reasons
|
||||
metadata["path_boost_multiplier"] = boost_multiplier
|
||||
if "source_path_topic_overlap" in boost_reasons and overlap_tokens:
|
||||
metadata["path_boost_overlap_tokens"] = overlap_tokens
|
||||
penalized.append(
|
||||
result.model_copy(
|
||||
deep=True,
|
||||
update={
|
||||
"score": max(0.0, float(result.score) * multiplier),
|
||||
"metadata": metadata,
|
||||
},
|
||||
)
|
||||
)
|
||||
else:
|
||||
penalized.append(result)
|
||||
|
||||
penalized.sort(key=lambda r: r.score, reverse=True)
|
||||
return penalized
|
||||
|
||||
|
||||
def rebalance_noisy_results(
|
||||
results: List[SearchResult],
|
||||
query: str,
|
||||
) -> List[SearchResult]:
|
||||
"""Move noisy test/generated/auxiliary results behind implementation hits when safe."""
|
||||
if not results:
|
||||
return []
|
||||
|
||||
query_intent = detect_query_intent(query)
|
||||
skip_test_penalty = query_targets_test_files(query)
|
||||
skip_auxiliary_penalty = query_targets_auxiliary_files(query)
|
||||
skip_generated_penalty = query_targets_generated_files(query)
|
||||
query_topic_tokens = _semantic_query_topic_tokens(query)
|
||||
keyword_path_query = query_intent == QueryIntent.KEYWORD and len(query_topic_tokens) >= 2
|
||||
explicit_feature_query = bool(extract_explicit_path_hints(query))
|
||||
source_oriented_query = (
|
||||
explicit_feature_query
|
||||
or keyword_path_query
|
||||
or (
|
||||
query_intent in {QueryIntent.SEMANTIC, QueryIntent.MIXED}
|
||||
and len(query_topic_tokens) >= 2
|
||||
)
|
||||
)
|
||||
if not source_oriented_query:
|
||||
return results
|
||||
|
||||
max_generated_results = len(results) if skip_generated_penalty else 0
|
||||
max_test_results = len(results) if skip_test_penalty else (0 if explicit_feature_query else 1)
|
||||
max_auxiliary_results = len(results) if skip_auxiliary_penalty else (0 if explicit_feature_query else 1)
|
||||
|
||||
selected: List[SearchResult] = []
|
||||
deferred: List[SearchResult] = []
|
||||
generated_count = 0
|
||||
test_count = 0
|
||||
auxiliary_count = 0
|
||||
|
||||
for result in results:
|
||||
if not skip_generated_penalty and is_generated_artifact_path(result.path):
|
||||
if generated_count >= max_generated_results:
|
||||
deferred.append(result)
|
||||
continue
|
||||
generated_count += 1
|
||||
selected.append(result)
|
||||
continue
|
||||
|
||||
if not skip_test_penalty and is_test_file(result.path):
|
||||
if test_count >= max_test_results:
|
||||
deferred.append(result)
|
||||
continue
|
||||
test_count += 1
|
||||
selected.append(result)
|
||||
continue
|
||||
|
||||
if not skip_auxiliary_penalty and is_auxiliary_reference_path(result.path):
|
||||
if auxiliary_count >= max_auxiliary_results:
|
||||
deferred.append(result)
|
||||
continue
|
||||
auxiliary_count += 1
|
||||
selected.append(result)
|
||||
continue
|
||||
|
||||
selected.append(result)
|
||||
|
||||
return selected + deferred
|
||||
|
||||
|
||||
def simple_weighted_fusion(
|
||||
results_map: Dict[str, List[SearchResult]],
|
||||
weights: Dict[str, float] = None,
|
||||
@@ -633,10 +1386,16 @@ def cross_encoder_rerank(
|
||||
raw_scores = reranker.predict(pairs, batch_size=int(batch_size))
|
||||
else:
|
||||
return results
|
||||
except Exception:
|
||||
except Exception as exc:
|
||||
logger.debug("Cross-encoder rerank failed; returning original ranking: %s", exc)
|
||||
return results
|
||||
|
||||
if not raw_scores or len(raw_scores) != rerank_count:
|
||||
logger.debug(
|
||||
"Cross-encoder rerank returned %d scores for %d candidates; returning original ranking",
|
||||
len(raw_scores) if raw_scores else 0,
|
||||
rerank_count,
|
||||
)
|
||||
return results
|
||||
|
||||
scores = [float(s) for s in raw_scores]
|
||||
@@ -653,26 +1412,13 @@ def cross_encoder_rerank(
|
||||
else:
|
||||
probs = [sigmoid(s) for s in scores]
|
||||
|
||||
query_intent = detect_query_intent(query)
|
||||
skip_test_penalty = query_targets_test_files(query)
|
||||
skip_auxiliary_penalty = query_targets_auxiliary_files(query)
|
||||
skip_generated_penalty = query_targets_generated_files(query)
|
||||
keyword_path_query = query_intent == QueryIntent.KEYWORD and len(_semantic_query_topic_tokens(query)) >= 2
|
||||
reranked_results: List[SearchResult] = []
|
||||
|
||||
# Helper to detect test files
|
||||
def is_test_file(path: str) -> bool:
|
||||
if not path:
|
||||
return False
|
||||
basename = path.split("/")[-1].split("\\")[-1]
|
||||
return (
|
||||
basename.startswith("test_") or
|
||||
basename.endswith("_test.py") or
|
||||
basename.endswith(".test.ts") or
|
||||
basename.endswith(".test.js") or
|
||||
basename.endswith(".spec.ts") or
|
||||
basename.endswith(".spec.js") or
|
||||
"/tests/" in path or
|
||||
"\\tests\\" in path or
|
||||
"/test/" in path or
|
||||
"\\test\\" in path
|
||||
)
|
||||
|
||||
for idx, result in enumerate(results):
|
||||
if idx < rerank_count:
|
||||
prev_score = float(result.score)
|
||||
@@ -699,6 +1445,52 @@ def cross_encoder_rerank(
|
||||
if test_file_penalty > 0 and is_test_file(result.path):
|
||||
combined_score = combined_score * (1.0 - test_file_penalty)
|
||||
|
||||
cross_encoder_floor_reason = None
|
||||
cross_encoder_floor_score = None
|
||||
cross_encoder_floor_overlap_tokens: List[str] = []
|
||||
if (
|
||||
(query_intent in {QueryIntent.SEMANTIC, QueryIntent.MIXED} or keyword_path_query)
|
||||
and not skip_test_penalty
|
||||
and not skip_auxiliary_penalty
|
||||
and not skip_generated_penalty
|
||||
and not is_test_file(result.path)
|
||||
and not is_generated_artifact_path(result.path)
|
||||
and not is_auxiliary_reference_path(result.path)
|
||||
and _is_source_implementation_path(result.path)
|
||||
):
|
||||
semantic_path_boost, overlap_tokens = _source_path_topic_boost(
|
||||
query,
|
||||
result.path,
|
||||
query_intent,
|
||||
)
|
||||
if semantic_path_boost > 1.0:
|
||||
floor_ratio = 0.8 if semantic_path_boost >= 1.35 else 0.75
|
||||
candidate_floor = prev_score * floor_ratio
|
||||
if candidate_floor > combined_score:
|
||||
combined_score = candidate_floor
|
||||
cross_encoder_floor_reason = (
|
||||
"keyword_source_path_overlap"
|
||||
if query_intent == QueryIntent.KEYWORD
|
||||
else "semantic_source_path_overlap"
|
||||
)
|
||||
cross_encoder_floor_score = candidate_floor
|
||||
cross_encoder_floor_overlap_tokens = overlap_tokens
|
||||
|
||||
metadata = {
|
||||
**result.metadata,
|
||||
"pre_cross_encoder_score": prev_score,
|
||||
"cross_encoder_score": ce_score,
|
||||
"cross_encoder_prob": ce_prob,
|
||||
"cross_encoder_reranked": True,
|
||||
}
|
||||
if cross_encoder_floor_reason is not None:
|
||||
metadata["cross_encoder_floor_reason"] = cross_encoder_floor_reason
|
||||
metadata["cross_encoder_floor_score"] = cross_encoder_floor_score
|
||||
if cross_encoder_floor_overlap_tokens:
|
||||
metadata["cross_encoder_floor_overlap_tokens"] = (
|
||||
cross_encoder_floor_overlap_tokens
|
||||
)
|
||||
|
||||
reranked_results.append(
|
||||
SearchResult(
|
||||
path=result.path,
|
||||
@@ -707,13 +1499,7 @@ def cross_encoder_rerank(
|
||||
content=result.content,
|
||||
symbol=result.symbol,
|
||||
chunk=result.chunk,
|
||||
metadata={
|
||||
**result.metadata,
|
||||
"pre_cross_encoder_score": prev_score,
|
||||
"cross_encoder_score": ce_score,
|
||||
"cross_encoder_prob": ce_prob,
|
||||
"cross_encoder_reranked": True,
|
||||
},
|
||||
metadata=metadata,
|
||||
start_line=result.start_line,
|
||||
end_line=result.end_line,
|
||||
symbol_name=result.symbol_name,
|
||||
|
||||
@@ -383,8 +383,37 @@ class ANNIndex:
|
||||
if self._index is None or self._current_count == 0:
|
||||
return [], [] # Empty index
|
||||
|
||||
# Perform kNN search
|
||||
labels, distances = self._index.knn_query(query, k=top_k)
|
||||
effective_k = min(max(int(top_k), 0), self._current_count)
|
||||
if effective_k == 0:
|
||||
return [], []
|
||||
|
||||
try:
|
||||
self._index.set_ef(max(self.ef, effective_k))
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
while True:
|
||||
try:
|
||||
labels, distances = self._index.knn_query(query, k=effective_k)
|
||||
break
|
||||
except Exception as exc:
|
||||
if "contiguous 2D array" in str(exc) and effective_k > 1:
|
||||
next_k = max(1, effective_k // 2)
|
||||
logger.debug(
|
||||
"ANN search knn_query failed for k=%d; retrying with k=%d: %s",
|
||||
effective_k,
|
||||
next_k,
|
||||
exc,
|
||||
)
|
||||
if next_k == effective_k:
|
||||
raise
|
||||
effective_k = next_k
|
||||
try:
|
||||
self._index.set_ef(max(self.ef, effective_k))
|
||||
except Exception:
|
||||
pass
|
||||
continue
|
||||
raise
|
||||
|
||||
# Convert to lists and flatten (knn_query returns 2D arrays)
|
||||
ids = labels[0].tolist()
|
||||
|
||||
@@ -15,7 +15,7 @@ def check_reranker_available(backend: str) -> tuple[bool, str | None]:
|
||||
|
||||
Notes:
|
||||
- "fastembed" uses fastembed TextCrossEncoder (pip install fastembed>=0.4.0). [Recommended]
|
||||
- "onnx" redirects to "fastembed" for backward compatibility.
|
||||
- "onnx" uses Optimum + ONNX Runtime (pip install onnxruntime optimum[onnxruntime] transformers).
|
||||
- "legacy" uses sentence-transformers CrossEncoder (pip install codexlens[reranker-legacy]).
|
||||
- "api" uses a remote reranking HTTP API (requires httpx).
|
||||
- "litellm" uses `ccw-litellm` for unified access to LLM providers.
|
||||
@@ -33,10 +33,9 @@ def check_reranker_available(backend: str) -> tuple[bool, str | None]:
|
||||
return check_fastembed_reranker_available()
|
||||
|
||||
if backend == "onnx":
|
||||
# Redirect to fastembed for backward compatibility
|
||||
from .fastembed_reranker import check_fastembed_reranker_available
|
||||
from .onnx_reranker import check_onnx_reranker_available
|
||||
|
||||
return check_fastembed_reranker_available()
|
||||
return check_onnx_reranker_available()
|
||||
|
||||
if backend == "litellm":
|
||||
try:
|
||||
@@ -66,7 +65,7 @@ def check_reranker_available(backend: str) -> tuple[bool, str | None]:
|
||||
|
||||
|
||||
def get_reranker(
|
||||
backend: str = "fastembed",
|
||||
backend: str = "onnx",
|
||||
model_name: str | None = None,
|
||||
*,
|
||||
device: str | None = None,
|
||||
@@ -76,18 +75,18 @@ def get_reranker(
|
||||
|
||||
Args:
|
||||
backend: Reranker backend to use. Options:
|
||||
- "fastembed": FastEmbed TextCrossEncoder backend (default, recommended)
|
||||
- "onnx": Redirects to fastembed for backward compatibility
|
||||
- "onnx": Optimum + ONNX Runtime backend (default)
|
||||
- "fastembed": FastEmbed TextCrossEncoder backend
|
||||
- "api": HTTP API backend (remote providers)
|
||||
- "litellm": LiteLLM backend (LLM-based, for API mode)
|
||||
- "legacy": sentence-transformers CrossEncoder backend (optional)
|
||||
model_name: Model identifier for model-based backends. Defaults depend on backend:
|
||||
- onnx: Xenova/ms-marco-MiniLM-L-6-v2
|
||||
- fastembed: Xenova/ms-marco-MiniLM-L-6-v2
|
||||
- onnx: (redirects to fastembed)
|
||||
- api: BAAI/bge-reranker-v2-m3 (SiliconFlow)
|
||||
- legacy: cross-encoder/ms-marco-MiniLM-L-6-v2
|
||||
- litellm: default
|
||||
device: Optional device string for backends that support it (legacy only).
|
||||
device: Optional device string for backends that support it (legacy and onnx).
|
||||
**kwargs: Additional backend-specific arguments.
|
||||
|
||||
Returns:
|
||||
@@ -111,16 +110,17 @@ def get_reranker(
|
||||
return FastEmbedReranker(model_name=resolved_model_name, **kwargs)
|
||||
|
||||
if backend == "onnx":
|
||||
# Redirect to fastembed for backward compatibility
|
||||
ok, err = check_reranker_available("fastembed")
|
||||
ok, err = check_reranker_available("onnx")
|
||||
if not ok:
|
||||
raise ImportError(err)
|
||||
|
||||
from .fastembed_reranker import FastEmbedReranker
|
||||
from .onnx_reranker import ONNXReranker
|
||||
|
||||
resolved_model_name = (model_name or "").strip() or FastEmbedReranker.DEFAULT_MODEL
|
||||
_ = device # Device selection is managed via fastembed providers.
|
||||
return FastEmbedReranker(model_name=resolved_model_name, **kwargs)
|
||||
resolved_model_name = (model_name or "").strip() or ONNXReranker.DEFAULT_MODEL
|
||||
effective_kwargs = dict(kwargs)
|
||||
if "use_gpu" not in effective_kwargs and device is not None:
|
||||
effective_kwargs["use_gpu"] = str(device).strip().lower() not in {"cpu", "none"}
|
||||
return ONNXReranker(model_name=resolved_model_name, **effective_kwargs)
|
||||
|
||||
if backend == "legacy":
|
||||
ok, err = check_reranker_available("legacy")
|
||||
|
||||
@@ -58,6 +58,38 @@ def _iter_batches(items: Sequence[Any], batch_size: int) -> Iterable[Sequence[An
|
||||
yield items[i : i + batch_size]
|
||||
|
||||
|
||||
def _normalize_provider_specs(
|
||||
providers: Sequence[Any] | None,
|
||||
) -> tuple[list[str], list[dict[str, Any]]]:
|
||||
"""Split execution-provider specs into Optimum-compatible names and options."""
|
||||
normalized_providers: list[str] = []
|
||||
normalized_options: list[dict[str, Any]] = []
|
||||
|
||||
for provider in providers or ():
|
||||
provider_name: str | None = None
|
||||
provider_options: dict[str, Any] = {}
|
||||
|
||||
if isinstance(provider, tuple):
|
||||
if provider:
|
||||
provider_name = str(provider[0]).strip()
|
||||
if len(provider) > 1 and isinstance(provider[1], dict):
|
||||
provider_options = dict(provider[1])
|
||||
elif provider is not None:
|
||||
provider_name = str(provider).strip()
|
||||
|
||||
if not provider_name:
|
||||
continue
|
||||
|
||||
normalized_providers.append(provider_name)
|
||||
normalized_options.append(provider_options)
|
||||
|
||||
if not normalized_providers:
|
||||
normalized_providers.append("CPUExecutionProvider")
|
||||
normalized_options.append({})
|
||||
|
||||
return normalized_providers, normalized_options
|
||||
|
||||
|
||||
class ONNXReranker(BaseReranker):
|
||||
"""Cross-encoder reranker using Optimum + ONNX Runtime with lazy loading."""
|
||||
|
||||
@@ -110,19 +142,21 @@ class ONNXReranker(BaseReranker):
|
||||
use_gpu=self.use_gpu, with_device_options=True
|
||||
)
|
||||
|
||||
provider_names, provider_options = _normalize_provider_specs(self.providers)
|
||||
|
||||
# Some Optimum versions accept `providers`, others accept a single `provider`.
|
||||
# Prefer passing the full providers list, with a conservative fallback.
|
||||
model_kwargs: dict[str, Any] = {}
|
||||
try:
|
||||
params = signature(ORTModelForSequenceClassification.from_pretrained).parameters
|
||||
if "providers" in params:
|
||||
model_kwargs["providers"] = self.providers
|
||||
model_kwargs["providers"] = provider_names
|
||||
if "provider_options" in params:
|
||||
model_kwargs["provider_options"] = provider_options
|
||||
elif "provider" in params:
|
||||
provider_name = "CPUExecutionProvider"
|
||||
if self.providers:
|
||||
first = self.providers[0]
|
||||
provider_name = first[0] if isinstance(first, tuple) else str(first)
|
||||
model_kwargs["provider"] = provider_name
|
||||
model_kwargs["provider"] = provider_names[0]
|
||||
if "provider_options" in params and provider_options[0]:
|
||||
model_kwargs["provider_options"] = provider_options[0]
|
||||
except Exception:
|
||||
model_kwargs = {}
|
||||
|
||||
|
||||
47
codex-lens/src/codexlens/storage/index_filters.py
Normal file
47
codex-lens/src/codexlens/storage/index_filters.py
Normal file
@@ -0,0 +1,47 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Iterable, List, Optional, Set
|
||||
|
||||
from codexlens.storage.index_tree import DEFAULT_IGNORE_DIRS
|
||||
|
||||
|
||||
EXTRA_IGNORED_INDEX_DIRS = frozenset({".workflow"})
|
||||
IGNORED_INDEX_DIRS = frozenset({name.casefold() for name in DEFAULT_IGNORE_DIRS | set(EXTRA_IGNORED_INDEX_DIRS)})
|
||||
|
||||
|
||||
def is_ignored_index_path(
|
||||
index_path: Path,
|
||||
scan_root: Path,
|
||||
*,
|
||||
ignored_dir_names: Optional[Set[str]] = None,
|
||||
) -> bool:
|
||||
"""Return True when an index lives under an ignored/generated subtree."""
|
||||
|
||||
ignored = (
|
||||
{name.casefold() for name in ignored_dir_names}
|
||||
if ignored_dir_names is not None
|
||||
else IGNORED_INDEX_DIRS
|
||||
)
|
||||
|
||||
try:
|
||||
relative_parts = index_path.resolve().relative_to(scan_root.resolve()).parts[:-1]
|
||||
except ValueError:
|
||||
return False
|
||||
|
||||
return any(part.casefold() in ignored for part in relative_parts)
|
||||
|
||||
|
||||
def filter_index_paths(
|
||||
index_paths: Iterable[Path],
|
||||
scan_root: Path,
|
||||
*,
|
||||
ignored_dir_names: Optional[Set[str]] = None,
|
||||
) -> List[Path]:
|
||||
"""Filter out discovered indexes that belong to ignored/generated subtrees."""
|
||||
|
||||
return [
|
||||
path
|
||||
for path in index_paths
|
||||
if not is_ignored_index_path(path, scan_root, ignored_dir_names=ignored_dir_names)
|
||||
]
|
||||
@@ -252,6 +252,18 @@ class IndexTreeBuilder:
|
||||
# Collect directories by depth
|
||||
dirs_by_depth = self._collect_dirs_by_depth(source_root, languages)
|
||||
|
||||
if force_full:
|
||||
pruned_dirs = self._prune_stale_project_dirs(
|
||||
project_id=project_info.id,
|
||||
source_root=source_root,
|
||||
dirs_by_depth=dirs_by_depth,
|
||||
)
|
||||
if pruned_dirs:
|
||||
self.logger.info(
|
||||
"Pruned %d stale directory mappings before full rebuild",
|
||||
len(pruned_dirs),
|
||||
)
|
||||
|
||||
if not dirs_by_depth:
|
||||
self.logger.warning("No indexable directories found in %s", source_root)
|
||||
if global_index is not None:
|
||||
@@ -450,6 +462,52 @@ class IndexTreeBuilder:
|
||||
|
||||
# === Internal Methods ===
|
||||
|
||||
def _prune_stale_project_dirs(
|
||||
self,
|
||||
*,
|
||||
project_id: int,
|
||||
source_root: Path,
|
||||
dirs_by_depth: Dict[int, List[Path]],
|
||||
) -> List[Path]:
|
||||
"""Remove registry mappings for directories no longer included in the index tree."""
|
||||
source_root = source_root.resolve()
|
||||
valid_dirs: Set[Path] = {
|
||||
path.resolve()
|
||||
for paths in dirs_by_depth.values()
|
||||
for path in paths
|
||||
}
|
||||
valid_dirs.add(source_root)
|
||||
|
||||
stale_mappings = []
|
||||
for mapping in self.registry.get_project_dirs(project_id):
|
||||
mapping_path = mapping.source_path.resolve()
|
||||
if mapping_path in valid_dirs:
|
||||
continue
|
||||
try:
|
||||
mapping_path.relative_to(source_root)
|
||||
except ValueError:
|
||||
continue
|
||||
stale_mappings.append(mapping)
|
||||
|
||||
stale_mappings.sort(
|
||||
key=lambda mapping: len(mapping.source_path.resolve().relative_to(source_root).parts),
|
||||
reverse=True,
|
||||
)
|
||||
|
||||
pruned_paths: List[Path] = []
|
||||
for mapping in stale_mappings:
|
||||
try:
|
||||
if self.registry.unregister_dir(mapping.source_path):
|
||||
pruned_paths.append(mapping.source_path.resolve())
|
||||
except Exception as exc:
|
||||
self.logger.warning(
|
||||
"Failed to prune stale mapping for %s: %s",
|
||||
mapping.source_path,
|
||||
exc,
|
||||
)
|
||||
|
||||
return pruned_paths
|
||||
|
||||
def _collect_dirs_by_depth(
|
||||
self, source_root: Path, languages: List[str] = None
|
||||
) -> Dict[int, List[Path]]:
|
||||
@@ -620,8 +678,9 @@ class IndexTreeBuilder:
|
||||
"static_graph_enabled": self.config.static_graph_enabled,
|
||||
"static_graph_relationship_types": self.config.static_graph_relationship_types,
|
||||
"use_astgrep": getattr(self.config, "use_astgrep", False),
|
||||
"ignore_patterns": list(getattr(self.config, "ignore_patterns", [])),
|
||||
"extension_filters": list(getattr(self.config, "extension_filters", [])),
|
||||
"ignore_patterns": list(self.ignore_patterns),
|
||||
"extension_filters": list(self.extension_filters),
|
||||
"incremental": bool(self.incremental),
|
||||
}
|
||||
|
||||
worker_args = [
|
||||
@@ -693,6 +752,9 @@ class IndexTreeBuilder:
|
||||
# Ensure index directory exists
|
||||
index_db_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
if not self.incremental:
|
||||
_reset_index_db_files(index_db_path)
|
||||
|
||||
# Create directory index
|
||||
if self.config.global_symbol_index_enabled:
|
||||
global_index = GlobalSymbolIndex(global_index_db_path, project_id=project_id)
|
||||
@@ -1100,6 +1162,18 @@ def _matches_extension_filters(path: Path, patterns: List[str], source_root: Opt
|
||||
return _matches_path_patterns(path, patterns, source_root)
|
||||
|
||||
|
||||
def _reset_index_db_files(index_db_path: Path) -> None:
|
||||
"""Best-effort removal of a directory index DB and common SQLite sidecars."""
|
||||
for suffix in ("", "-wal", "-shm", "-journal"):
|
||||
target = Path(f"{index_db_path}{suffix}") if suffix else index_db_path
|
||||
try:
|
||||
target.unlink()
|
||||
except FileNotFoundError:
|
||||
continue
|
||||
except OSError:
|
||||
continue
|
||||
|
||||
|
||||
def _build_dir_worker(args: tuple) -> DirBuildResult:
|
||||
"""Worker function for parallel directory building.
|
||||
|
||||
@@ -1140,6 +1214,9 @@ def _build_dir_worker(args: tuple) -> DirBuildResult:
|
||||
global_index = GlobalSymbolIndex(Path(global_index_db_path), project_id=int(project_id))
|
||||
global_index.initialize()
|
||||
|
||||
if not bool(config_dict.get("incremental", True)):
|
||||
_reset_index_db_files(index_db_path)
|
||||
|
||||
store = DirIndexStore(index_db_path, config=config, global_index=global_index)
|
||||
store.initialize()
|
||||
|
||||
|
||||
@@ -591,6 +591,56 @@ class RegistryStore:
|
||||
|
||||
return [self._row_to_dir_mapping(row) for row in rows]
|
||||
|
||||
def find_descendant_project_roots(self, source_root: Path) -> List[DirMapping]:
|
||||
"""Return root directory mappings for nested projects under ``source_root``."""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
source_root_resolved = source_root.resolve()
|
||||
source_root_str = self._normalize_path_for_comparison(source_root_resolved)
|
||||
|
||||
rows = conn.execute(
|
||||
"""
|
||||
SELECT dm.*
|
||||
FROM dir_mapping dm
|
||||
INNER JOIN projects p ON p.id = dm.project_id
|
||||
WHERE dm.source_path = p.source_root
|
||||
AND p.source_root LIKE ?
|
||||
ORDER BY p.source_root ASC
|
||||
""",
|
||||
(f"{source_root_str}%",),
|
||||
).fetchall()
|
||||
|
||||
descendant_roots: List[DirMapping] = []
|
||||
normalized_root_path = Path(source_root_str)
|
||||
|
||||
for row in rows:
|
||||
mapping = self._row_to_dir_mapping(row)
|
||||
normalized_mapping_path = Path(
|
||||
self._normalize_path_for_comparison(mapping.source_path.resolve())
|
||||
)
|
||||
|
||||
if normalized_mapping_path == normalized_root_path:
|
||||
continue
|
||||
|
||||
try:
|
||||
normalized_mapping_path.relative_to(normalized_root_path)
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
descendant_roots.append(mapping)
|
||||
|
||||
descendant_roots.sort(
|
||||
key=lambda mapping: (
|
||||
len(
|
||||
mapping.source_path.resolve().relative_to(
|
||||
source_root_resolved
|
||||
).parts
|
||||
),
|
||||
self._normalize_path_for_comparison(mapping.source_path.resolve()),
|
||||
)
|
||||
)
|
||||
return descendant_roots
|
||||
|
||||
def update_dir_stats(self, source_path: Path, files_count: int) -> None:
|
||||
"""Update directory statistics.
|
||||
|
||||
|
||||
@@ -11,12 +11,25 @@ Common Fixtures:
|
||||
- sample_code_files: Factory for creating sample code files
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import tempfile
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any
|
||||
import sqlite3
|
||||
import shutil
|
||||
import tempfile
|
||||
import warnings
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict
|
||||
|
||||
import pytest
|
||||
|
||||
warnings.filterwarnings(
|
||||
"ignore",
|
||||
message=r"'BaseCommand' is deprecated and will be removed in Click 9\.0\..*",
|
||||
category=DeprecationWarning,
|
||||
)
|
||||
warnings.filterwarnings(
|
||||
"ignore",
|
||||
message=r"The '__version__' attribute is deprecated and will be removed in Click 9\.1\..*",
|
||||
category=DeprecationWarning,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
|
||||
@@ -98,6 +98,23 @@ class TestANNIndex:
|
||||
assert ids[0] == 1 # ID of first vector
|
||||
assert distances[0] < 0.01 # Very small distance (almost identical)
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not _hnswlib_available(),
|
||||
reason="hnswlib not installed"
|
||||
)
|
||||
def test_search_clamps_top_k_to_available_vectors(self, temp_db, sample_vectors, sample_ids):
|
||||
"""Search should clamp top_k to the loaded vector count."""
|
||||
from codexlens.semantic.ann_index import ANNIndex
|
||||
|
||||
index = ANNIndex(temp_db, dim=384)
|
||||
index.add_vectors(sample_ids[:3], sample_vectors[:3])
|
||||
|
||||
ids, distances = index.search(sample_vectors[0], top_k=10)
|
||||
|
||||
assert len(ids) == 3
|
||||
assert len(distances) == 3
|
||||
assert ids[0] == 1
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not _hnswlib_available(),
|
||||
reason="hnswlib not installed"
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
350
codex-lens/tests/test_compare_ccw_smart_search_stage2.py
Normal file
350
codex-lens/tests/test_compare_ccw_smart_search_stage2.py
Normal file
@@ -0,0 +1,350 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import importlib.util
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from types import SimpleNamespace
|
||||
|
||||
|
||||
MODULE_PATH = Path(__file__).resolve().parents[1] / "benchmarks" / "compare_ccw_smart_search_stage2.py"
|
||||
MODULE_NAME = "compare_ccw_smart_search_stage2_test_module"
|
||||
MODULE_SPEC = importlib.util.spec_from_file_location(MODULE_NAME, MODULE_PATH)
|
||||
assert MODULE_SPEC is not None and MODULE_SPEC.loader is not None
|
||||
benchmark = importlib.util.module_from_spec(MODULE_SPEC)
|
||||
sys.modules[MODULE_NAME] = benchmark
|
||||
MODULE_SPEC.loader.exec_module(benchmark)
|
||||
|
||||
|
||||
class _FakeChainResult:
|
||||
def __init__(self, paths: list[str]) -> None:
|
||||
self.results = [SimpleNamespace(path=path) for path in paths]
|
||||
|
||||
|
||||
class _FakeEngine:
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
search_paths: list[str] | None = None,
|
||||
cascade_paths: list[str] | None = None,
|
||||
) -> None:
|
||||
self.search_paths = search_paths or []
|
||||
self.cascade_paths = cascade_paths or []
|
||||
self.search_calls: list[dict[str, object]] = []
|
||||
self.cascade_calls: list[dict[str, object]] = []
|
||||
|
||||
def search(self, query: str, source_path: Path, options: object) -> _FakeChainResult:
|
||||
self.search_calls.append(
|
||||
{
|
||||
"query": query,
|
||||
"source_path": source_path,
|
||||
"options": options,
|
||||
}
|
||||
)
|
||||
return _FakeChainResult(self.search_paths)
|
||||
|
||||
def cascade_search(
|
||||
self,
|
||||
query: str,
|
||||
source_path: Path,
|
||||
*,
|
||||
k: int,
|
||||
coarse_k: int,
|
||||
options: object,
|
||||
strategy: str,
|
||||
) -> _FakeChainResult:
|
||||
self.cascade_calls.append(
|
||||
{
|
||||
"query": query,
|
||||
"source_path": source_path,
|
||||
"k": k,
|
||||
"coarse_k": coarse_k,
|
||||
"options": options,
|
||||
"strategy": strategy,
|
||||
}
|
||||
)
|
||||
return _FakeChainResult(self.cascade_paths)
|
||||
|
||||
|
||||
def test_strategy_specs_include_baselines_before_stage2_modes() -> None:
|
||||
specs = benchmark._strategy_specs(
|
||||
["realtime", "static_global_graph"],
|
||||
include_dense_baseline=True,
|
||||
baseline_methods=["auto", "fts", "hybrid"],
|
||||
)
|
||||
|
||||
assert [spec.strategy_key for spec in specs] == [
|
||||
"auto",
|
||||
"fts",
|
||||
"hybrid",
|
||||
"dense_rerank",
|
||||
"staged:realtime",
|
||||
"staged:static_global_graph",
|
||||
]
|
||||
|
||||
|
||||
def test_select_effective_method_matches_cli_auto_routing() -> None:
|
||||
assert benchmark._select_effective_method("find_descendant_project_roots", "auto") == "fts"
|
||||
assert benchmark._select_effective_method("build dist artifact output", "auto") == "fts"
|
||||
assert benchmark._select_effective_method("embedding backend fastembed local litellm api config", "auto") == "fts"
|
||||
assert benchmark._select_effective_method("get_reranker factory onnx backend selection", "auto") == "fts"
|
||||
assert benchmark._select_effective_method("how does the authentication flow work", "auto") == "dense_rerank"
|
||||
assert benchmark._select_effective_method("how smart_search keyword routing works", "auto") == "hybrid"
|
||||
|
||||
|
||||
def test_filter_dataset_by_query_match_uses_case_insensitive_substring() -> None:
|
||||
dataset = [
|
||||
{"query": "embedding backend fastembed local litellm api config", "relevant_paths": ["a"]},
|
||||
{"query": "get_reranker factory onnx backend selection", "relevant_paths": ["b"]},
|
||||
{"query": "how does smart search route keyword queries", "relevant_paths": ["c"]},
|
||||
]
|
||||
|
||||
filtered = benchmark._filter_dataset_by_query_match(dataset, "BACKEND")
|
||||
assert [item["query"] for item in filtered] == [
|
||||
"embedding backend fastembed local litellm api config",
|
||||
"get_reranker factory onnx backend selection",
|
||||
]
|
||||
|
||||
narrow_filtered = benchmark._filter_dataset_by_query_match(dataset, "FASTEMBED")
|
||||
assert [item["query"] for item in narrow_filtered] == [
|
||||
"embedding backend fastembed local litellm api config",
|
||||
]
|
||||
|
||||
unfiltered = benchmark._filter_dataset_by_query_match(dataset, None)
|
||||
assert [item["query"] for item in unfiltered] == [item["query"] for item in dataset]
|
||||
|
||||
|
||||
def test_apply_query_limit_runs_after_filtering() -> None:
|
||||
dataset = [
|
||||
{"query": "executeHybridMode dense_rerank semantic smart_search", "relevant_paths": ["a"]},
|
||||
{"query": "embedding backend fastembed local litellm api config", "relevant_paths": ["b"]},
|
||||
{"query": "reranker backend onnx api legacy configuration", "relevant_paths": ["c"]},
|
||||
]
|
||||
|
||||
filtered = benchmark._filter_dataset_by_query_match(dataset, "backend")
|
||||
limited = benchmark._apply_query_limit(filtered, 1)
|
||||
|
||||
assert [item["query"] for item in limited] == [
|
||||
"embedding backend fastembed local litellm api config",
|
||||
]
|
||||
|
||||
|
||||
def test_make_progress_payload_reports_partial_completion() -> None:
|
||||
args = SimpleNamespace(
|
||||
queries_file=Path("queries.jsonl"),
|
||||
k=10,
|
||||
coarse_k=100,
|
||||
)
|
||||
strategy_specs = [
|
||||
benchmark.StrategySpec(strategy_key="auto", strategy="auto", stage2_mode=None),
|
||||
benchmark.StrategySpec(strategy_key="dense_rerank", strategy="dense_rerank", stage2_mode=None),
|
||||
]
|
||||
evaluations = [
|
||||
benchmark.QueryEvaluation(
|
||||
query="embedding backend fastembed local litellm api config",
|
||||
intent="config",
|
||||
notes=None,
|
||||
relevant_paths=["codex-lens/src/codexlens/config.py"],
|
||||
runs={
|
||||
"auto": benchmark.StrategyRun(
|
||||
strategy_key="auto",
|
||||
strategy="auto",
|
||||
stage2_mode=None,
|
||||
effective_method="fts",
|
||||
execution_method="fts",
|
||||
latency_ms=123.0,
|
||||
topk_paths=["config.py"],
|
||||
first_hit_rank=1,
|
||||
hit_at_k=True,
|
||||
recall_at_k=1.0,
|
||||
generated_artifact_count=0,
|
||||
test_file_count=0,
|
||||
error=None,
|
||||
)
|
||||
},
|
||||
)
|
||||
]
|
||||
|
||||
payload = benchmark._make_progress_payload(
|
||||
args=args,
|
||||
source_root=Path("D:/repo"),
|
||||
strategy_specs=strategy_specs,
|
||||
evaluations=evaluations,
|
||||
query_index=1,
|
||||
total_queries=3,
|
||||
run_index=2,
|
||||
total_runs=6,
|
||||
current_query="embedding backend fastembed local litellm api config",
|
||||
current_strategy_key="complete",
|
||||
)
|
||||
|
||||
assert payload["status"] == "running"
|
||||
assert payload["progress"]["completed_queries"] == 1
|
||||
assert payload["progress"]["completed_runs"] == 2
|
||||
assert payload["progress"]["total_runs"] == 6
|
||||
assert payload["strategy_keys"] == ["auto", "dense_rerank"]
|
||||
assert payload["evaluations"][0]["runs"]["auto"]["effective_method"] == "fts"
|
||||
|
||||
|
||||
def test_write_final_outputs_updates_progress_snapshot(tmp_path: Path) -> None:
|
||||
output_path = tmp_path / "results.json"
|
||||
progress_path = tmp_path / "progress.json"
|
||||
payload = {
|
||||
"status": "completed",
|
||||
"query_count": 1,
|
||||
"strategies": {"auto": {"effective_methods": {"fts": 1}}},
|
||||
}
|
||||
|
||||
benchmark._write_final_outputs(
|
||||
output_path=output_path,
|
||||
progress_output=progress_path,
|
||||
payload=payload,
|
||||
)
|
||||
|
||||
assert json.loads(output_path.read_text(encoding="utf-8")) == payload
|
||||
assert json.loads(progress_path.read_text(encoding="utf-8")) == payload
|
||||
|
||||
|
||||
def test_build_parser_defaults_reranker_gpu_to_disabled() -> None:
|
||||
parser = benchmark.build_parser()
|
||||
args = parser.parse_args([])
|
||||
|
||||
assert args.embedding_use_gpu is False
|
||||
assert args.reranker_use_gpu is False
|
||||
assert args.reranker_model == benchmark.DEFAULT_LOCAL_ONNX_RERANKER_MODEL
|
||||
|
||||
|
||||
def test_build_strategy_runtime_clones_config(monkeypatch, tmp_path: Path) -> None:
|
||||
class _FakeRegistry:
|
||||
def __init__(self) -> None:
|
||||
self.initialized = False
|
||||
|
||||
def initialize(self) -> None:
|
||||
self.initialized = True
|
||||
|
||||
class _FakeMapper:
|
||||
pass
|
||||
|
||||
class _FakeEngine:
|
||||
def __init__(self, *, registry, mapper, config) -> None:
|
||||
self.registry = registry
|
||||
self.mapper = mapper
|
||||
self.config = config
|
||||
|
||||
monkeypatch.setattr(benchmark, "RegistryStore", _FakeRegistry)
|
||||
monkeypatch.setattr(benchmark, "PathMapper", _FakeMapper)
|
||||
monkeypatch.setattr(benchmark, "ChainSearchEngine", _FakeEngine)
|
||||
|
||||
base_config = benchmark.Config(data_dir=tmp_path, reranker_use_gpu=False)
|
||||
strategy_spec = benchmark.StrategySpec(strategy_key="dense_rerank", strategy="dense_rerank", stage2_mode=None)
|
||||
|
||||
runtime = benchmark._build_strategy_runtime(base_config, strategy_spec)
|
||||
|
||||
assert runtime.strategy_spec == strategy_spec
|
||||
assert runtime.config is not base_config
|
||||
assert runtime.config.reranker_use_gpu is False
|
||||
assert runtime.registry.initialized is True
|
||||
assert runtime.engine.config is runtime.config
|
||||
|
||||
|
||||
def test_run_strategy_routes_auto_keyword_queries_to_fts_search() -> None:
|
||||
engine = _FakeEngine(
|
||||
search_paths=[
|
||||
"D:/repo/src/codexlens/storage/registry.py",
|
||||
"D:/repo/build/lib/codexlens/storage/registry.py",
|
||||
]
|
||||
)
|
||||
config = SimpleNamespace(cascade_strategy="staged", staged_stage2_mode="realtime")
|
||||
relevant = {benchmark._normalize_path_key("D:/repo/src/codexlens/storage/registry.py")}
|
||||
|
||||
run = benchmark._run_strategy(
|
||||
engine,
|
||||
config,
|
||||
strategy_spec=benchmark.StrategySpec(strategy_key="auto", strategy="auto", stage2_mode=None),
|
||||
query="find_descendant_project_roots",
|
||||
source_path=Path("D:/repo"),
|
||||
k=5,
|
||||
coarse_k=20,
|
||||
relevant=relevant,
|
||||
)
|
||||
|
||||
assert len(engine.search_calls) == 1
|
||||
assert len(engine.cascade_calls) == 0
|
||||
assert run.effective_method == "fts"
|
||||
assert run.execution_method == "fts"
|
||||
assert run.hit_at_k is True
|
||||
assert run.generated_artifact_count == 1
|
||||
assert run.test_file_count == 0
|
||||
|
||||
|
||||
def test_run_strategy_uses_cascade_for_dense_rerank_and_restores_config() -> None:
|
||||
engine = _FakeEngine(cascade_paths=["D:/repo/src/tools/smart-search.ts"])
|
||||
config = SimpleNamespace(cascade_strategy="staged", staged_stage2_mode="static_global_graph")
|
||||
relevant = {benchmark._normalize_path_key("D:/repo/src/tools/smart-search.ts")}
|
||||
|
||||
run = benchmark._run_strategy(
|
||||
engine,
|
||||
config,
|
||||
strategy_spec=benchmark.StrategySpec(
|
||||
strategy_key="dense_rerank",
|
||||
strategy="dense_rerank",
|
||||
stage2_mode=None,
|
||||
),
|
||||
query="how does smart search route keyword queries",
|
||||
source_path=Path("D:/repo"),
|
||||
k=5,
|
||||
coarse_k=20,
|
||||
relevant=relevant,
|
||||
)
|
||||
|
||||
assert len(engine.search_calls) == 0
|
||||
assert len(engine.cascade_calls) == 1
|
||||
assert engine.cascade_calls[0]["strategy"] == "dense_rerank"
|
||||
assert run.effective_method == "dense_rerank"
|
||||
assert run.execution_method == "cascade"
|
||||
assert run.hit_at_k is True
|
||||
assert config.cascade_strategy == "staged"
|
||||
assert config.staged_stage2_mode == "static_global_graph"
|
||||
|
||||
|
||||
def test_summarize_runs_tracks_effective_method_and_artifact_pressure() -> None:
|
||||
summary = benchmark._summarize_runs(
|
||||
[
|
||||
benchmark.StrategyRun(
|
||||
strategy_key="auto",
|
||||
strategy="auto",
|
||||
stage2_mode=None,
|
||||
effective_method="fts",
|
||||
execution_method="fts",
|
||||
latency_ms=10.0,
|
||||
topk_paths=["a"],
|
||||
first_hit_rank=1,
|
||||
hit_at_k=True,
|
||||
recall_at_k=1.0,
|
||||
generated_artifact_count=1,
|
||||
test_file_count=0,
|
||||
error=None,
|
||||
),
|
||||
benchmark.StrategyRun(
|
||||
strategy_key="auto",
|
||||
strategy="auto",
|
||||
stage2_mode=None,
|
||||
effective_method="hybrid",
|
||||
execution_method="hybrid",
|
||||
latency_ms=30.0,
|
||||
topk_paths=["b"],
|
||||
first_hit_rank=None,
|
||||
hit_at_k=False,
|
||||
recall_at_k=0.0,
|
||||
generated_artifact_count=0,
|
||||
test_file_count=2,
|
||||
error=None,
|
||||
),
|
||||
]
|
||||
)
|
||||
|
||||
assert summary["effective_methods"] == {"fts": 1, "hybrid": 1}
|
||||
assert summary["runs_with_generated_artifacts"] == 1
|
||||
assert summary["runs_with_test_files"] == 1
|
||||
assert summary["avg_generated_artifact_count"] == 0.5
|
||||
assert summary["avg_test_file_count"] == 1.0
|
||||
83
codex-lens/tests/test_config_search_env_overrides.py
Normal file
83
codex-lens/tests/test_config_search_env_overrides.py
Normal file
@@ -0,0 +1,83 @@
|
||||
"""Unit tests for Config .env overrides for final search ranking penalties."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from codexlens.config import Config
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def temp_config_dir() -> Path:
|
||||
"""Create temporary directory for config data_dir."""
|
||||
tmpdir = tempfile.TemporaryDirectory(ignore_cleanup_errors=True)
|
||||
yield Path(tmpdir.name)
|
||||
try:
|
||||
tmpdir.cleanup()
|
||||
except (PermissionError, OSError):
|
||||
pass
|
||||
|
||||
|
||||
def test_search_penalty_env_overrides_apply(temp_config_dir: Path) -> None:
|
||||
config = Config(data_dir=temp_config_dir)
|
||||
|
||||
env_path = temp_config_dir / ".env"
|
||||
env_path.write_text(
|
||||
"\n".join(
|
||||
[
|
||||
"TEST_FILE_PENALTY=0.25",
|
||||
"GENERATED_FILE_PENALTY=0.4",
|
||||
"",
|
||||
]
|
||||
),
|
||||
encoding="utf-8",
|
||||
)
|
||||
|
||||
config.load_settings()
|
||||
|
||||
assert config.test_file_penalty == 0.25
|
||||
assert config.generated_file_penalty == 0.4
|
||||
|
||||
|
||||
def test_reranker_gpu_env_override_apply(temp_config_dir: Path) -> None:
|
||||
config = Config(data_dir=temp_config_dir)
|
||||
|
||||
env_path = temp_config_dir / ".env"
|
||||
env_path.write_text(
|
||||
"\n".join(
|
||||
[
|
||||
"RERANKER_USE_GPU=false",
|
||||
"",
|
||||
]
|
||||
),
|
||||
encoding="utf-8",
|
||||
)
|
||||
|
||||
config.load_settings()
|
||||
|
||||
assert config.reranker_use_gpu is False
|
||||
|
||||
|
||||
def test_search_penalty_env_overrides_invalid_ignored(temp_config_dir: Path) -> None:
|
||||
config = Config(data_dir=temp_config_dir)
|
||||
|
||||
env_path = temp_config_dir / ".env"
|
||||
env_path.write_text(
|
||||
"\n".join(
|
||||
[
|
||||
"TEST_FILE_PENALTY=oops",
|
||||
"GENERATED_FILE_PENALTY=nope",
|
||||
"",
|
||||
]
|
||||
),
|
||||
encoding="utf-8",
|
||||
)
|
||||
|
||||
config.load_settings()
|
||||
|
||||
assert config.test_file_penalty == 0.15
|
||||
assert config.generated_file_penalty == 0.35
|
||||
assert config.reranker_use_gpu is True
|
||||
204
codex-lens/tests/test_embedding_status_root_model.py
Normal file
204
codex-lens/tests/test_embedding_status_root_model.py
Normal file
@@ -0,0 +1,204 @@
|
||||
import gc
|
||||
import gc
|
||||
import shutil
|
||||
import sqlite3
|
||||
import tempfile
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
import codexlens.cli.embedding_manager as embedding_manager
|
||||
from codexlens.cli.embedding_manager import get_embedding_stats_summary, get_embeddings_status
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def status_temp_dir() -> Path:
|
||||
temp_path = Path(tempfile.mkdtemp())
|
||||
try:
|
||||
yield temp_path
|
||||
finally:
|
||||
gc.collect()
|
||||
for _ in range(5):
|
||||
try:
|
||||
if temp_path.exists():
|
||||
shutil.rmtree(temp_path)
|
||||
break
|
||||
except PermissionError:
|
||||
time.sleep(0.1)
|
||||
|
||||
|
||||
def _create_index_db(index_path: Path, files: list[str], embedded_files: list[str] | None = None) -> None:
|
||||
index_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with sqlite3.connect(index_path) as conn:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute(
|
||||
"""
|
||||
CREATE TABLE files (
|
||||
id INTEGER PRIMARY KEY,
|
||||
path TEXT NOT NULL UNIQUE,
|
||||
content TEXT,
|
||||
language TEXT,
|
||||
hash TEXT
|
||||
)
|
||||
"""
|
||||
)
|
||||
cursor.executemany(
|
||||
"INSERT INTO files (path, content, language, hash) VALUES (?, ?, ?, ?)",
|
||||
[(file_path, "", "python", f"hash-{idx}") for idx, file_path in enumerate(files)],
|
||||
)
|
||||
|
||||
if embedded_files is not None:
|
||||
cursor.execute(
|
||||
"""
|
||||
CREATE TABLE semantic_chunks (
|
||||
id INTEGER PRIMARY KEY,
|
||||
file_path TEXT NOT NULL,
|
||||
content TEXT,
|
||||
embedding BLOB,
|
||||
metadata TEXT,
|
||||
category TEXT
|
||||
)
|
||||
"""
|
||||
)
|
||||
cursor.executemany(
|
||||
"INSERT INTO semantic_chunks (file_path, content, embedding, metadata, category) VALUES (?, ?, ?, ?, ?)",
|
||||
[(file_path, "chunk", b"vec", "{}", "code") for file_path in embedded_files],
|
||||
)
|
||||
conn.commit()
|
||||
|
||||
|
||||
def _create_vectors_meta_db(meta_path: Path, embedded_files: list[str], binary_vector_count: int = 0) -> None:
|
||||
meta_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with sqlite3.connect(meta_path) as conn:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute(
|
||||
"""
|
||||
CREATE TABLE chunk_metadata (
|
||||
chunk_id INTEGER PRIMARY KEY,
|
||||
file_path TEXT NOT NULL,
|
||||
content TEXT,
|
||||
start_line INTEGER,
|
||||
end_line INTEGER,
|
||||
category TEXT,
|
||||
metadata TEXT,
|
||||
source_index_db TEXT
|
||||
)
|
||||
"""
|
||||
)
|
||||
cursor.execute(
|
||||
"""
|
||||
CREATE TABLE binary_vectors (
|
||||
chunk_id INTEGER PRIMARY KEY,
|
||||
vector BLOB NOT NULL
|
||||
)
|
||||
"""
|
||||
)
|
||||
cursor.executemany(
|
||||
"""
|
||||
INSERT INTO chunk_metadata (
|
||||
chunk_id, file_path, content, start_line, end_line, category, metadata, source_index_db
|
||||
) VALUES (?, ?, ?, ?, ?, ?, ?, ?)
|
||||
""",
|
||||
[
|
||||
(idx, file_path, "chunk", 1, 1, "code", "{}", str(meta_path.parent / "_index.db"))
|
||||
for idx, file_path in enumerate(embedded_files, start=1)
|
||||
],
|
||||
)
|
||||
cursor.executemany(
|
||||
"INSERT INTO binary_vectors (chunk_id, vector) VALUES (?, ?)",
|
||||
[(idx, b"\x01") for idx in range(1, binary_vector_count + 1)],
|
||||
)
|
||||
conn.commit()
|
||||
|
||||
|
||||
def test_root_status_does_not_inherit_child_embeddings(
|
||||
monkeypatch: pytest.MonkeyPatch, status_temp_dir: Path
|
||||
) -> None:
|
||||
workspace = status_temp_dir / "workspace"
|
||||
workspace.mkdir()
|
||||
_create_index_db(workspace / "_index.db", ["a.py", "b.py"])
|
||||
_create_index_db(workspace / "child" / "_index.db", ["child.py"], embedded_files=["child.py"])
|
||||
|
||||
monkeypatch.setattr(
|
||||
embedding_manager,
|
||||
"_get_model_info_from_index",
|
||||
lambda index_path: {
|
||||
"model_profile": "fast",
|
||||
"model_name": "unit-test-model",
|
||||
"embedding_dim": 384,
|
||||
"backend": "fastembed",
|
||||
"created_at": "2026-03-13T00:00:00Z",
|
||||
"updated_at": "2026-03-13T00:00:00Z",
|
||||
} if index_path.parent.name == "child" else None,
|
||||
)
|
||||
|
||||
status = get_embeddings_status(workspace)
|
||||
assert status["success"] is True
|
||||
|
||||
result = status["result"]
|
||||
assert result["coverage_percent"] == 0.0
|
||||
assert result["files_with_embeddings"] == 0
|
||||
assert result["root"]["has_embeddings"] is False
|
||||
assert result["model_info"] is None
|
||||
assert result["subtree"]["indexes_with_embeddings"] == 1
|
||||
assert result["subtree"]["coverage_percent"] > 0
|
||||
|
||||
|
||||
def test_root_status_uses_validated_centralized_metadata(status_temp_dir: Path) -> None:
|
||||
workspace = status_temp_dir / "workspace"
|
||||
workspace.mkdir()
|
||||
_create_index_db(workspace / "_index.db", ["a.py", "b.py"])
|
||||
_create_vectors_meta_db(workspace / "_vectors_meta.db", ["a.py"])
|
||||
(workspace / "_vectors.hnsw").write_bytes(b"hnsw")
|
||||
|
||||
status = get_embeddings_status(workspace)
|
||||
assert status["success"] is True
|
||||
|
||||
result = status["result"]
|
||||
assert result["coverage_percent"] == 50.0
|
||||
assert result["files_with_embeddings"] == 1
|
||||
assert result["total_chunks"] == 1
|
||||
assert result["root"]["has_embeddings"] is True
|
||||
assert result["root"]["storage_mode"] == "centralized"
|
||||
assert result["centralized"]["dense_ready"] is True
|
||||
assert result["centralized"]["usable"] is True
|
||||
|
||||
|
||||
def test_embedding_stats_summary_skips_ignored_artifact_indexes(status_temp_dir: Path) -> None:
|
||||
workspace = status_temp_dir / "workspace"
|
||||
workspace.mkdir()
|
||||
_create_index_db(workspace / "_index.db", ["root.py"])
|
||||
_create_index_db(workspace / "src" / "_index.db", ["src.py"])
|
||||
_create_index_db(workspace / "dist" / "_index.db", ["bundle.py"], embedded_files=["bundle.py"])
|
||||
_create_index_db(workspace / ".workflow" / "_index.db", ["trace.py"], embedded_files=["trace.py"])
|
||||
|
||||
summary = get_embedding_stats_summary(workspace)
|
||||
|
||||
assert summary["success"] is True
|
||||
result = summary["result"]
|
||||
assert result["total_indexes"] == 2
|
||||
assert {Path(item["path"]).relative_to(workspace).as_posix() for item in result["indexes"]} == {
|
||||
"_index.db",
|
||||
"src/_index.db",
|
||||
}
|
||||
|
||||
|
||||
def test_root_status_ignores_empty_centralized_artifacts(status_temp_dir: Path) -> None:
|
||||
workspace = status_temp_dir / "workspace"
|
||||
workspace.mkdir()
|
||||
_create_index_db(workspace / "_index.db", ["a.py", "b.py"])
|
||||
_create_vectors_meta_db(workspace / "_vectors_meta.db", [])
|
||||
(workspace / "_vectors.hnsw").write_bytes(b"hnsw")
|
||||
(workspace / "_binary_vectors.mmap").write_bytes(b"mmap")
|
||||
|
||||
status = get_embeddings_status(workspace)
|
||||
assert status["success"] is True
|
||||
|
||||
result = status["result"]
|
||||
assert result["coverage_percent"] == 0.0
|
||||
assert result["files_with_embeddings"] == 0
|
||||
assert result["root"]["has_embeddings"] is False
|
||||
assert result["centralized"]["chunk_metadata_rows"] == 0
|
||||
assert result["centralized"]["binary_vector_rows"] == 0
|
||||
assert result["centralized"]["usable"] is False
|
||||
@@ -833,6 +833,36 @@ class TestHybridSearchAdaptiveWeights:
|
||||
|
||||
assert captured["weights"]["vector"] > 0.6
|
||||
|
||||
def test_default_engine_weights_keep_lsp_graph_backend_available(self):
|
||||
"""Legacy public defaults should not discard LSP graph fusion weights internally."""
|
||||
from unittest.mock import patch
|
||||
|
||||
engine = HybridSearchEngine()
|
||||
|
||||
results_map = {
|
||||
"exact": [SearchResult(path="a.py", score=10.0, excerpt="a")],
|
||||
"fuzzy": [SearchResult(path="b.py", score=9.0, excerpt="b")],
|
||||
"vector": [SearchResult(path="c.py", score=0.9, excerpt="c")],
|
||||
"lsp_graph": [SearchResult(path="d.py", score=0.8, excerpt="d")],
|
||||
}
|
||||
|
||||
captured = {}
|
||||
from codexlens.search import ranking as ranking_module
|
||||
|
||||
def capture_rrf(map_in, weights_in, k=60):
|
||||
captured["weights"] = dict(weights_in)
|
||||
return ranking_module.reciprocal_rank_fusion(map_in, weights_in, k=k)
|
||||
|
||||
with patch.object(HybridSearchEngine, "_search_parallel", return_value=results_map), patch(
|
||||
"codexlens.search.hybrid_search.reciprocal_rank_fusion",
|
||||
side_effect=capture_rrf,
|
||||
):
|
||||
engine.search(Path("dummy.db"), "auth flow", enable_vector=True, enable_lsp_graph=True)
|
||||
|
||||
assert engine.weights == HybridSearchEngine.DEFAULT_WEIGHTS
|
||||
assert "lsp_graph" in captured["weights"]
|
||||
assert captured["weights"]["lsp_graph"] > 0.0
|
||||
|
||||
def test_reranking_enabled(self, tmp_path):
|
||||
"""Reranking runs only when explicitly enabled via config."""
|
||||
from unittest.mock import patch
|
||||
|
||||
@@ -93,7 +93,8 @@ def test_get_cross_encoder_reranker_uses_factory_backend_onnx_gpu_flag(
|
||||
enable_reranking=True,
|
||||
enable_cross_encoder_rerank=True,
|
||||
reranker_backend="onnx",
|
||||
embedding_use_gpu=False,
|
||||
embedding_use_gpu=True,
|
||||
reranker_use_gpu=False,
|
||||
)
|
||||
engine = HybridSearchEngine(config=config)
|
||||
|
||||
@@ -109,6 +110,58 @@ def test_get_cross_encoder_reranker_uses_factory_backend_onnx_gpu_flag(
|
||||
assert get_args["kwargs"]["use_gpu"] is False
|
||||
|
||||
|
||||
def test_get_cross_encoder_reranker_uses_cpu_device_for_legacy_when_reranker_gpu_disabled(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
tmp_path,
|
||||
) -> None:
|
||||
calls: dict[str, object] = {}
|
||||
|
||||
def fake_check_reranker_available(backend: str):
|
||||
calls["check_backend"] = backend
|
||||
return True, None
|
||||
|
||||
sentinel = object()
|
||||
|
||||
def fake_get_reranker(*, backend: str, model_name=None, device=None, **kwargs):
|
||||
calls["get_args"] = {
|
||||
"backend": backend,
|
||||
"model_name": model_name,
|
||||
"device": device,
|
||||
"kwargs": kwargs,
|
||||
}
|
||||
return sentinel
|
||||
|
||||
monkeypatch.setattr(
|
||||
"codexlens.semantic.reranker.check_reranker_available",
|
||||
fake_check_reranker_available,
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
"codexlens.semantic.reranker.get_reranker",
|
||||
fake_get_reranker,
|
||||
)
|
||||
|
||||
config = Config(
|
||||
data_dir=tmp_path / "legacy-cpu",
|
||||
enable_reranking=True,
|
||||
enable_cross_encoder_rerank=True,
|
||||
reranker_backend="legacy",
|
||||
reranker_model="dummy-model",
|
||||
embedding_use_gpu=True,
|
||||
reranker_use_gpu=False,
|
||||
)
|
||||
engine = HybridSearchEngine(config=config)
|
||||
|
||||
reranker = engine._get_cross_encoder_reranker()
|
||||
assert reranker is sentinel
|
||||
assert calls["check_backend"] == "legacy"
|
||||
|
||||
get_args = calls["get_args"]
|
||||
assert isinstance(get_args, dict)
|
||||
assert get_args["backend"] == "legacy"
|
||||
assert get_args["model_name"] == "dummy-model"
|
||||
assert get_args["device"] == "cpu"
|
||||
|
||||
|
||||
def test_get_cross_encoder_reranker_returns_none_when_backend_unavailable(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
tmp_path,
|
||||
|
||||
@@ -150,6 +150,30 @@ class TestHybridSearchBackends:
|
||||
assert "exact" in backends
|
||||
assert "vector" in backends
|
||||
|
||||
def test_search_lexical_priority_query_skips_vector_backend(self, temp_paths, mock_config):
|
||||
"""Config/env/factory queries should stay lexical-first in hybrid mode."""
|
||||
engine = HybridSearchEngine(config=mock_config)
|
||||
index_path = temp_paths / "_index.db"
|
||||
|
||||
with patch.object(engine, "_search_parallel") as mock_parallel:
|
||||
mock_parallel.return_value = {
|
||||
"exact": [SearchResult(path="config.py", score=10.0, excerpt="exact")],
|
||||
"fuzzy": [SearchResult(path="env_config.py", score=8.0, excerpt="fuzzy")],
|
||||
}
|
||||
|
||||
results = engine.search(
|
||||
index_path,
|
||||
"embedding backend fastembed local litellm api config",
|
||||
enable_fuzzy=True,
|
||||
enable_vector=True,
|
||||
)
|
||||
|
||||
assert len(results) >= 1
|
||||
backends = mock_parallel.call_args[0][2]
|
||||
assert "exact" in backends
|
||||
assert "fuzzy" in backends
|
||||
assert "vector" not in backends
|
||||
|
||||
def test_search_pure_vector(self, temp_paths, mock_config):
|
||||
"""Pure vector mode should only use vector backend."""
|
||||
engine = HybridSearchEngine(config=mock_config)
|
||||
@@ -257,6 +281,39 @@ class TestHybridSearchFusion:
|
||||
|
||||
mock_rerank.assert_called_once()
|
||||
|
||||
def test_search_lexical_priority_query_skips_expensive_reranking(self, temp_paths, mock_config):
|
||||
"""Lexical-priority queries should bypass embedder and cross-encoder reranking."""
|
||||
mock_config.enable_reranking = True
|
||||
mock_config.enable_cross_encoder_rerank = True
|
||||
mock_config.reranking_top_k = 50
|
||||
mock_config.reranker_top_k = 20
|
||||
engine = HybridSearchEngine(config=mock_config)
|
||||
index_path = temp_paths / "_index.db"
|
||||
|
||||
with patch.object(engine, "_search_parallel") as mock_parallel:
|
||||
mock_parallel.return_value = {
|
||||
"exact": [SearchResult(path="config.py", score=10.0, excerpt="code")],
|
||||
"fuzzy": [SearchResult(path="env_config.py", score=9.0, excerpt="env vars")],
|
||||
}
|
||||
|
||||
with patch("codexlens.search.hybrid_search.rerank_results") as mock_rerank, patch(
|
||||
"codexlens.search.hybrid_search.cross_encoder_rerank"
|
||||
) as mock_cross_encoder, patch.object(
|
||||
engine,
|
||||
"_get_cross_encoder_reranker",
|
||||
) as mock_get_reranker:
|
||||
results = engine.search(
|
||||
index_path,
|
||||
"get_reranker factory onnx backend selection",
|
||||
enable_fuzzy=True,
|
||||
enable_vector=True,
|
||||
)
|
||||
|
||||
assert len(results) >= 1
|
||||
mock_rerank.assert_not_called()
|
||||
mock_cross_encoder.assert_not_called()
|
||||
mock_get_reranker.assert_not_called()
|
||||
|
||||
def test_search_category_filtering(self, temp_paths, mock_config):
|
||||
"""Category filtering should separate code/doc results by intent."""
|
||||
mock_config.enable_category_filter = True
|
||||
@@ -316,6 +373,217 @@ class TestSearchParallel:
|
||||
mock_fuzzy.assert_called_once()
|
||||
|
||||
|
||||
class TestCentralizedMetadataFetch:
|
||||
"""Tests for centralized metadata retrieval helpers."""
|
||||
|
||||
def test_fetch_from_vector_meta_store_clamps_negative_scores(self, temp_paths, mock_config, monkeypatch):
|
||||
engine = HybridSearchEngine(config=mock_config)
|
||||
|
||||
class FakeMetaStore:
|
||||
def __init__(self, _path):
|
||||
pass
|
||||
|
||||
def __enter__(self):
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc, tb):
|
||||
return False
|
||||
|
||||
def get_chunks_by_ids(self, _chunk_ids, category=None):
|
||||
assert category is None
|
||||
return [
|
||||
{
|
||||
"chunk_id": 7,
|
||||
"file_path": "src/app.py",
|
||||
"content": "def app(): pass",
|
||||
"metadata": {},
|
||||
"start_line": 1,
|
||||
"end_line": 1,
|
||||
}
|
||||
]
|
||||
|
||||
import codexlens.storage.vector_meta_store as vector_meta_store
|
||||
|
||||
monkeypatch.setattr(vector_meta_store, "VectorMetadataStore", FakeMetaStore)
|
||||
|
||||
results = engine._fetch_from_vector_meta_store(
|
||||
temp_paths / "_vectors_meta.db",
|
||||
[7],
|
||||
{7: -0.01},
|
||||
)
|
||||
|
||||
assert len(results) == 1
|
||||
assert results[0].path == "src/app.py"
|
||||
assert results[0].score == 0.0
|
||||
|
||||
|
||||
class TestCentralizedVectorCaching:
|
||||
"""Tests for centralized vector search runtime caches."""
|
||||
|
||||
def test_search_vector_centralized_reuses_cached_resources(
|
||||
self,
|
||||
temp_paths,
|
||||
mock_config,
|
||||
):
|
||||
engine = HybridSearchEngine(config=mock_config)
|
||||
hnsw_path = temp_paths / "_vectors.hnsw"
|
||||
hnsw_path.write_bytes(b"hnsw")
|
||||
|
||||
vector_store_opened: List[Path] = []
|
||||
|
||||
class FakeVectorStore:
|
||||
def __init__(self, path):
|
||||
vector_store_opened.append(Path(path))
|
||||
|
||||
def __enter__(self):
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc, tb):
|
||||
return False
|
||||
|
||||
def get_model_config(self):
|
||||
return {
|
||||
"backend": "fastembed",
|
||||
"model_name": "BAAI/bge-small-en-v1.5",
|
||||
"model_profile": "fast",
|
||||
"embedding_dim": 384,
|
||||
}
|
||||
|
||||
class FakeEmbedder:
|
||||
embedding_dim = 384
|
||||
|
||||
def __init__(self):
|
||||
self.embed_calls: List[str] = []
|
||||
|
||||
def embed_single(self, query):
|
||||
self.embed_calls.append(query)
|
||||
return [0.1, 0.2, 0.3]
|
||||
|
||||
class FakeAnnIndex:
|
||||
def __init__(self):
|
||||
self.load_calls = 0
|
||||
self.search_calls = 0
|
||||
|
||||
def load(self):
|
||||
self.load_calls += 1
|
||||
return True
|
||||
|
||||
def count(self):
|
||||
return 3
|
||||
|
||||
def search(self, _query_vec, top_k):
|
||||
self.search_calls += 1
|
||||
assert top_k == 10
|
||||
return [7], [0.2]
|
||||
|
||||
fake_embedder = FakeEmbedder()
|
||||
fake_ann_index = FakeAnnIndex()
|
||||
|
||||
with patch("codexlens.semantic.vector_store.VectorStore", FakeVectorStore), patch(
|
||||
"codexlens.semantic.factory.get_embedder",
|
||||
return_value=fake_embedder,
|
||||
) as mock_get_embedder, patch(
|
||||
"codexlens.semantic.ann_index.ANNIndex.create_central",
|
||||
return_value=fake_ann_index,
|
||||
) as mock_create_central, patch.object(
|
||||
engine,
|
||||
"_fetch_chunks_by_ids_centralized",
|
||||
return_value=[SearchResult(path="src/app.py", score=0.8, excerpt="hit")],
|
||||
) as mock_fetch:
|
||||
first = engine._search_vector_centralized(
|
||||
temp_paths / "child-a" / "_index.db",
|
||||
hnsw_path,
|
||||
"smart search routing",
|
||||
limit=5,
|
||||
)
|
||||
second = engine._search_vector_centralized(
|
||||
temp_paths / "child-b" / "_index.db",
|
||||
hnsw_path,
|
||||
"smart search routing",
|
||||
limit=5,
|
||||
)
|
||||
|
||||
assert [result.path for result in first] == ["src/app.py"]
|
||||
assert [result.path for result in second] == ["src/app.py"]
|
||||
assert vector_store_opened == [temp_paths / "_index.db"]
|
||||
assert mock_get_embedder.call_count == 1
|
||||
assert mock_create_central.call_count == 1
|
||||
assert fake_ann_index.load_calls == 1
|
||||
assert fake_embedder.embed_calls == ["smart search routing"]
|
||||
assert fake_ann_index.search_calls == 2
|
||||
assert mock_fetch.call_count == 2
|
||||
|
||||
def test_search_vector_centralized_respects_embedding_use_gpu(
|
||||
self,
|
||||
temp_paths,
|
||||
mock_config,
|
||||
):
|
||||
engine = HybridSearchEngine(config=mock_config)
|
||||
hnsw_path = temp_paths / "_vectors.hnsw"
|
||||
hnsw_path.write_bytes(b"hnsw")
|
||||
|
||||
class FakeVectorStore:
|
||||
def __init__(self, _path):
|
||||
pass
|
||||
|
||||
def __enter__(self):
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc, tb):
|
||||
return False
|
||||
|
||||
def get_model_config(self):
|
||||
return {
|
||||
"backend": "fastembed",
|
||||
"model_name": "BAAI/bge-small-en-v1.5",
|
||||
"model_profile": "code",
|
||||
"embedding_dim": 384,
|
||||
}
|
||||
|
||||
class FakeEmbedder:
|
||||
embedding_dim = 384
|
||||
|
||||
def embed_single(self, _query):
|
||||
return [0.1, 0.2]
|
||||
|
||||
class FakeAnnIndex:
|
||||
def load(self):
|
||||
return True
|
||||
|
||||
def count(self):
|
||||
return 1
|
||||
|
||||
def search(self, _query_vec, top_k):
|
||||
assert top_k == 6
|
||||
return [9], [0.1]
|
||||
|
||||
with patch("codexlens.semantic.vector_store.VectorStore", FakeVectorStore), patch(
|
||||
"codexlens.semantic.factory.get_embedder",
|
||||
return_value=FakeEmbedder(),
|
||||
) as mock_get_embedder, patch(
|
||||
"codexlens.semantic.ann_index.ANNIndex.create_central",
|
||||
return_value=FakeAnnIndex(),
|
||||
), patch.object(
|
||||
engine,
|
||||
"_fetch_chunks_by_ids_centralized",
|
||||
return_value=[SearchResult(path="src/app.py", score=0.9, excerpt="hit")],
|
||||
):
|
||||
results = engine._search_vector_centralized(
|
||||
temp_paths / "_index.db",
|
||||
hnsw_path,
|
||||
"semantic query",
|
||||
limit=3,
|
||||
)
|
||||
|
||||
assert len(results) == 1
|
||||
assert mock_get_embedder.call_count == 1
|
||||
assert mock_get_embedder.call_args.kwargs == {
|
||||
"backend": "fastembed",
|
||||
"profile": "code",
|
||||
"use_gpu": False,
|
||||
}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Tests: _search_lsp_graph
|
||||
# =============================================================================
|
||||
|
||||
674
codex-lens/tests/test_index_status_cli_contract.py
Normal file
674
codex-lens/tests/test_index_status_cli_contract.py
Normal file
@@ -0,0 +1,674 @@
|
||||
import json
|
||||
|
||||
from typer.testing import CliRunner
|
||||
|
||||
import codexlens.cli.commands as commands
|
||||
from codexlens.cli.commands import app
|
||||
import codexlens.cli.embedding_manager as embedding_manager
|
||||
from codexlens.config import Config
|
||||
from codexlens.entities import SearchResult
|
||||
from codexlens.search.chain_search import ChainSearchResult, SearchStats
|
||||
|
||||
|
||||
def test_index_status_json_preserves_legacy_embeddings_contract(
|
||||
monkeypatch,
|
||||
tmp_path,
|
||||
) -> None:
|
||||
workspace = tmp_path / "workspace"
|
||||
workspace.mkdir()
|
||||
(workspace / "_index.db").touch()
|
||||
|
||||
legacy_summary = {
|
||||
"total_indexes": 3,
|
||||
"indexes_with_embeddings": 1,
|
||||
"total_chunks": 42,
|
||||
"indexes": [
|
||||
{
|
||||
"project": "child",
|
||||
"path": str(workspace / "child" / "_index.db"),
|
||||
"has_embeddings": True,
|
||||
"total_chunks": 42,
|
||||
"total_files": 1,
|
||||
"coverage_percent": 100.0,
|
||||
}
|
||||
],
|
||||
}
|
||||
root_status = {
|
||||
"total_indexes": 3,
|
||||
"total_files": 2,
|
||||
"files_with_embeddings": 0,
|
||||
"files_without_embeddings": 2,
|
||||
"total_chunks": 0,
|
||||
"coverage_percent": 0.0,
|
||||
"indexes_with_embeddings": 1,
|
||||
"indexes_without_embeddings": 2,
|
||||
"model_info": None,
|
||||
"root": {
|
||||
"index_path": str(workspace / "_index.db"),
|
||||
"exists": False,
|
||||
"total_files": 2,
|
||||
"files_with_embeddings": 0,
|
||||
"files_without_embeddings": 2,
|
||||
"total_chunks": 0,
|
||||
"coverage_percent": 0.0,
|
||||
"has_embeddings": False,
|
||||
"storage_mode": "none",
|
||||
},
|
||||
"subtree": {
|
||||
"total_indexes": 3,
|
||||
"total_files": 3,
|
||||
"files_with_embeddings": 1,
|
||||
"files_without_embeddings": 2,
|
||||
"total_chunks": 42,
|
||||
"coverage_percent": 33.3,
|
||||
"indexes_with_embeddings": 1,
|
||||
"indexes_without_embeddings": 2,
|
||||
},
|
||||
"centralized": {
|
||||
"dense_index_exists": False,
|
||||
"binary_index_exists": False,
|
||||
"dense_ready": False,
|
||||
"binary_ready": False,
|
||||
"usable": False,
|
||||
"chunk_metadata_rows": 0,
|
||||
"binary_vector_rows": 0,
|
||||
"files_with_embeddings": 0,
|
||||
},
|
||||
}
|
||||
|
||||
monkeypatch.setattr(
|
||||
embedding_manager,
|
||||
"get_embeddings_status",
|
||||
lambda _index_root: {"success": True, "result": root_status},
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
embedding_manager,
|
||||
"get_embedding_stats_summary",
|
||||
lambda _index_root: {"success": True, "result": legacy_summary},
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
commands,
|
||||
"RegistryStore",
|
||||
type(
|
||||
"FakeRegistryStore",
|
||||
(),
|
||||
{
|
||||
"initialize": lambda self: None,
|
||||
"close": lambda self: None,
|
||||
},
|
||||
),
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
commands,
|
||||
"PathMapper",
|
||||
type(
|
||||
"FakePathMapper",
|
||||
(),
|
||||
{
|
||||
"source_to_index_db": lambda self, _target_path: workspace / "_index.db",
|
||||
},
|
||||
),
|
||||
)
|
||||
|
||||
runner = CliRunner()
|
||||
result = runner.invoke(app, ["index", "status", str(workspace), "--json"])
|
||||
|
||||
assert result.exit_code == 0, result.output
|
||||
payload = json.loads(result.stdout)
|
||||
body = payload["result"]
|
||||
assert body["embeddings"] == legacy_summary
|
||||
assert body["embeddings_error"] is None
|
||||
assert body["embeddings_status"] == root_status
|
||||
assert body["embeddings_status_error"] is None
|
||||
assert body["embeddings_summary"] == legacy_summary
|
||||
|
||||
|
||||
def test_search_json_preserves_dense_rerank_method_label(
|
||||
monkeypatch,
|
||||
tmp_path,
|
||||
) -> None:
|
||||
workspace = tmp_path / "workspace"
|
||||
workspace.mkdir()
|
||||
|
||||
search_result = ChainSearchResult(
|
||||
query="greet function",
|
||||
results=[
|
||||
SearchResult(
|
||||
path=str(workspace / "src" / "app.py"),
|
||||
score=0.97,
|
||||
excerpt="def greet(name):",
|
||||
content="def greet(name):\n return f'hello {name}'\n",
|
||||
)
|
||||
],
|
||||
symbols=[],
|
||||
stats=SearchStats(dirs_searched=2, files_matched=1, time_ms=12.5),
|
||||
)
|
||||
captured: dict[str, object] = {}
|
||||
|
||||
monkeypatch.setattr(commands.Config, "load", staticmethod(lambda: Config(data_dir=tmp_path / "data")))
|
||||
monkeypatch.setattr(
|
||||
commands,
|
||||
"RegistryStore",
|
||||
type(
|
||||
"FakeRegistryStore",
|
||||
(),
|
||||
{
|
||||
"initialize": lambda self: None,
|
||||
"close": lambda self: None,
|
||||
},
|
||||
),
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
commands,
|
||||
"PathMapper",
|
||||
type(
|
||||
"FakePathMapper",
|
||||
(),
|
||||
{},
|
||||
),
|
||||
)
|
||||
|
||||
class FakeChainSearchEngine:
|
||||
def __init__(self, registry, mapper, config=None):
|
||||
captured["registry"] = registry
|
||||
captured["mapper"] = mapper
|
||||
captured["config"] = config
|
||||
|
||||
def search(self, *_args, **_kwargs):
|
||||
raise AssertionError("dense_rerank should dispatch via cascade_search")
|
||||
|
||||
def cascade_search(self, query, source_path, k=10, options=None, strategy=None):
|
||||
captured["query"] = query
|
||||
captured["source_path"] = source_path
|
||||
captured["limit"] = k
|
||||
captured["options"] = options
|
||||
captured["strategy"] = strategy
|
||||
return search_result
|
||||
|
||||
monkeypatch.setattr(commands, "ChainSearchEngine", FakeChainSearchEngine)
|
||||
|
||||
runner = CliRunner()
|
||||
result = runner.invoke(
|
||||
app,
|
||||
["search", "greet function", "--path", str(workspace), "--method", "dense_rerank", "--json"],
|
||||
)
|
||||
|
||||
assert result.exit_code == 0, result.output
|
||||
payload = json.loads(result.stdout)
|
||||
body = payload["result"]
|
||||
assert body["method"] == "dense_rerank"
|
||||
assert body["count"] == 1
|
||||
assert body["results"][0]["path"] == str(workspace / "src" / "app.py")
|
||||
assert captured["strategy"] == "dense_rerank"
|
||||
assert captured["limit"] == 20
|
||||
|
||||
|
||||
def test_search_json_auto_routes_keyword_queries_to_fts(
|
||||
monkeypatch,
|
||||
tmp_path,
|
||||
) -> None:
|
||||
workspace = tmp_path / "workspace"
|
||||
workspace.mkdir()
|
||||
|
||||
search_result = ChainSearchResult(
|
||||
query="windowsHide",
|
||||
results=[
|
||||
SearchResult(
|
||||
path=str(workspace / "src" / "spawn.ts"),
|
||||
score=0.91,
|
||||
excerpt="windowsHide: true",
|
||||
content="spawn('node', [], { windowsHide: true })",
|
||||
)
|
||||
],
|
||||
symbols=[],
|
||||
stats=SearchStats(dirs_searched=2, files_matched=1, time_ms=8.0),
|
||||
)
|
||||
captured: dict[str, object] = {}
|
||||
|
||||
monkeypatch.setattr(commands.Config, "load", staticmethod(lambda: Config(data_dir=tmp_path / "data")))
|
||||
monkeypatch.setattr(
|
||||
commands,
|
||||
"RegistryStore",
|
||||
type("FakeRegistryStore", (), {"initialize": lambda self: None, "close": lambda self: None}),
|
||||
)
|
||||
monkeypatch.setattr(commands, "PathMapper", type("FakePathMapper", (), {}))
|
||||
|
||||
class FakeChainSearchEngine:
|
||||
def __init__(self, registry, mapper, config=None):
|
||||
captured["config"] = config
|
||||
|
||||
def search(self, query, source_path, options=None):
|
||||
captured["query"] = query
|
||||
captured["source_path"] = source_path
|
||||
captured["options"] = options
|
||||
return search_result
|
||||
|
||||
def cascade_search(self, *_args, **_kwargs):
|
||||
raise AssertionError("auto keyword queries should not dispatch to cascade_search")
|
||||
|
||||
monkeypatch.setattr(commands, "ChainSearchEngine", FakeChainSearchEngine)
|
||||
|
||||
runner = CliRunner()
|
||||
result = runner.invoke(
|
||||
app,
|
||||
["search", "windowsHide", "--path", str(workspace), "--json"],
|
||||
)
|
||||
|
||||
assert result.exit_code == 0, result.output
|
||||
body = json.loads(result.stdout)["result"]
|
||||
assert body["method"] == "fts"
|
||||
assert captured["options"].enable_vector is False
|
||||
assert captured["options"].hybrid_mode is False
|
||||
|
||||
|
||||
def test_search_json_auto_routes_mixed_queries_to_hybrid(
|
||||
monkeypatch,
|
||||
tmp_path,
|
||||
) -> None:
|
||||
workspace = tmp_path / "workspace"
|
||||
workspace.mkdir()
|
||||
|
||||
search_result = ChainSearchResult(
|
||||
query="how does my_function work",
|
||||
results=[
|
||||
SearchResult(
|
||||
path=str(workspace / "src" / "app.py"),
|
||||
score=0.81,
|
||||
excerpt="def my_function():",
|
||||
content="def my_function():\n return 1\n",
|
||||
)
|
||||
],
|
||||
symbols=[],
|
||||
stats=SearchStats(dirs_searched=2, files_matched=1, time_ms=10.0),
|
||||
)
|
||||
captured: dict[str, object] = {}
|
||||
|
||||
monkeypatch.setattr(commands.Config, "load", staticmethod(lambda: Config(data_dir=tmp_path / "data")))
|
||||
monkeypatch.setattr(
|
||||
commands,
|
||||
"RegistryStore",
|
||||
type("FakeRegistryStore", (), {"initialize": lambda self: None, "close": lambda self: None}),
|
||||
)
|
||||
monkeypatch.setattr(commands, "PathMapper", type("FakePathMapper", (), {}))
|
||||
|
||||
class FakeChainSearchEngine:
|
||||
def __init__(self, registry, mapper, config=None):
|
||||
captured["config"] = config
|
||||
|
||||
def search(self, query, source_path, options=None):
|
||||
captured["query"] = query
|
||||
captured["source_path"] = source_path
|
||||
captured["options"] = options
|
||||
return search_result
|
||||
|
||||
def cascade_search(self, *_args, **_kwargs):
|
||||
raise AssertionError("mixed auto queries should not dispatch to cascade_search")
|
||||
|
||||
monkeypatch.setattr(commands, "ChainSearchEngine", FakeChainSearchEngine)
|
||||
|
||||
runner = CliRunner()
|
||||
result = runner.invoke(
|
||||
app,
|
||||
["search", "how does my_function work", "--path", str(workspace), "--json"],
|
||||
)
|
||||
|
||||
assert result.exit_code == 0, result.output
|
||||
body = json.loads(result.stdout)["result"]
|
||||
assert body["method"] == "hybrid"
|
||||
assert captured["options"].enable_vector is True
|
||||
assert captured["options"].hybrid_mode is True
|
||||
assert captured["options"].enable_cascade is False
|
||||
|
||||
|
||||
def test_search_json_auto_routes_generated_artifact_queries_to_fts(
|
||||
monkeypatch,
|
||||
tmp_path,
|
||||
) -> None:
|
||||
workspace = tmp_path / "workspace"
|
||||
workspace.mkdir()
|
||||
|
||||
search_result = ChainSearchResult(
|
||||
query="dist bundle output",
|
||||
results=[
|
||||
SearchResult(
|
||||
path=str(workspace / "dist" / "bundle.js"),
|
||||
score=0.77,
|
||||
excerpt="bundle output",
|
||||
content="console.log('bundle')",
|
||||
)
|
||||
],
|
||||
symbols=[],
|
||||
stats=SearchStats(dirs_searched=2, files_matched=1, time_ms=9.0),
|
||||
)
|
||||
captured: dict[str, object] = {}
|
||||
|
||||
monkeypatch.setattr(commands.Config, "load", staticmethod(lambda: Config(data_dir=tmp_path / "data")))
|
||||
monkeypatch.setattr(
|
||||
commands,
|
||||
"RegistryStore",
|
||||
type("FakeRegistryStore", (), {"initialize": lambda self: None, "close": lambda self: None}),
|
||||
)
|
||||
monkeypatch.setattr(commands, "PathMapper", type("FakePathMapper", (), {}))
|
||||
|
||||
class FakeChainSearchEngine:
|
||||
def __init__(self, registry, mapper, config=None):
|
||||
captured["config"] = config
|
||||
|
||||
def search(self, query, source_path, options=None):
|
||||
captured["query"] = query
|
||||
captured["source_path"] = source_path
|
||||
captured["options"] = options
|
||||
return search_result
|
||||
|
||||
def cascade_search(self, *_args, **_kwargs):
|
||||
raise AssertionError("generated artifact auto queries should not dispatch to cascade_search")
|
||||
|
||||
monkeypatch.setattr(commands, "ChainSearchEngine", FakeChainSearchEngine)
|
||||
|
||||
runner = CliRunner()
|
||||
result = runner.invoke(
|
||||
app,
|
||||
["search", "dist bundle output", "--path", str(workspace), "--json"],
|
||||
)
|
||||
|
||||
assert result.exit_code == 0, result.output
|
||||
body = json.loads(result.stdout)["result"]
|
||||
assert body["method"] == "fts"
|
||||
assert captured["options"].enable_vector is False
|
||||
assert captured["options"].hybrid_mode is False
|
||||
|
||||
|
||||
def test_auto_select_search_method_prefers_fts_for_lexical_config_queries() -> None:
|
||||
assert commands._auto_select_search_method("embedding backend fastembed local litellm api config") == "fts"
|
||||
assert commands._auto_select_search_method("get_reranker factory onnx backend selection") == "fts"
|
||||
assert commands._auto_select_search_method("how to authenticate users safely?") == "dense_rerank"
|
||||
|
||||
|
||||
def test_search_json_fts_zero_results_uses_filesystem_fallback(
|
||||
monkeypatch,
|
||||
tmp_path,
|
||||
) -> None:
|
||||
workspace = tmp_path / "workspace"
|
||||
workspace.mkdir()
|
||||
|
||||
indexed_result = ChainSearchResult(
|
||||
query="find_descendant_project_roots",
|
||||
results=[],
|
||||
symbols=[],
|
||||
stats=SearchStats(dirs_searched=3, files_matched=0, time_ms=7.5),
|
||||
)
|
||||
fallback_result = SearchResult(
|
||||
path=str(workspace / "src" / "registry.py"),
|
||||
score=1.0,
|
||||
excerpt="def find_descendant_project_roots(...):",
|
||||
content=None,
|
||||
metadata={
|
||||
"filesystem_fallback": True,
|
||||
"backend": "ripgrep-fallback",
|
||||
"stale_index_suspected": True,
|
||||
},
|
||||
start_line=12,
|
||||
end_line=12,
|
||||
)
|
||||
captured: dict[str, object] = {"fallback_calls": 0}
|
||||
|
||||
monkeypatch.setattr(commands.Config, "load", staticmethod(lambda: Config(data_dir=tmp_path / "data")))
|
||||
monkeypatch.setattr(
|
||||
commands,
|
||||
"RegistryStore",
|
||||
type("FakeRegistryStore", (), {"initialize": lambda self: None, "close": lambda self: None}),
|
||||
)
|
||||
monkeypatch.setattr(commands, "PathMapper", type("FakePathMapper", (), {}))
|
||||
|
||||
class FakeChainSearchEngine:
|
||||
def __init__(self, registry, mapper, config=None):
|
||||
captured["config"] = config
|
||||
|
||||
def search(self, query, source_path, options=None):
|
||||
captured["query"] = query
|
||||
captured["source_path"] = source_path
|
||||
captured["options"] = options
|
||||
return indexed_result
|
||||
|
||||
def cascade_search(self, *_args, **_kwargs):
|
||||
raise AssertionError("fts zero-result queries should not dispatch to cascade_search")
|
||||
|
||||
def fake_fallback(query, source_path, *, limit, config, code_only=False, exclude_extensions=None):
|
||||
captured["fallback_calls"] = int(captured["fallback_calls"]) + 1
|
||||
captured["fallback_query"] = query
|
||||
captured["fallback_path"] = source_path
|
||||
captured["fallback_limit"] = limit
|
||||
captured["fallback_code_only"] = code_only
|
||||
captured["fallback_exclude_extensions"] = exclude_extensions
|
||||
return {
|
||||
"results": [fallback_result],
|
||||
"time_ms": 2.5,
|
||||
"fallback": {
|
||||
"backend": "ripgrep-fallback",
|
||||
"stale_index_suspected": True,
|
||||
"reason": "Indexed FTS search returned no results; filesystem fallback used.",
|
||||
},
|
||||
}
|
||||
|
||||
monkeypatch.setattr(commands, "ChainSearchEngine", FakeChainSearchEngine)
|
||||
monkeypatch.setattr(commands, "_filesystem_fallback_search", fake_fallback)
|
||||
|
||||
runner = CliRunner()
|
||||
result = runner.invoke(
|
||||
app,
|
||||
["search", "find_descendant_project_roots", "--method", "fts", "--path", str(workspace), "--json"],
|
||||
)
|
||||
|
||||
assert result.exit_code == 0, result.output
|
||||
body = json.loads(result.stdout)["result"]
|
||||
assert body["method"] == "fts"
|
||||
assert body["count"] == 1
|
||||
assert body["results"][0]["path"] == str(workspace / "src" / "registry.py")
|
||||
assert body["results"][0]["excerpt"] == "def find_descendant_project_roots(...):"
|
||||
assert body["stats"]["files_matched"] == 1
|
||||
assert body["stats"]["time_ms"] == 10.0
|
||||
assert body["fallback"] == {
|
||||
"backend": "ripgrep-fallback",
|
||||
"stale_index_suspected": True,
|
||||
"reason": "Indexed FTS search returned no results; filesystem fallback used.",
|
||||
}
|
||||
assert captured["fallback_calls"] == 1
|
||||
assert captured["fallback_query"] == "find_descendant_project_roots"
|
||||
assert captured["fallback_path"] == workspace
|
||||
assert captured["fallback_limit"] == 20
|
||||
assert captured["options"].enable_vector is False
|
||||
assert captured["options"].hybrid_mode is False
|
||||
|
||||
|
||||
def test_search_json_hybrid_zero_results_does_not_use_filesystem_fallback(
|
||||
monkeypatch,
|
||||
tmp_path,
|
||||
) -> None:
|
||||
workspace = tmp_path / "workspace"
|
||||
workspace.mkdir()
|
||||
|
||||
indexed_result = ChainSearchResult(
|
||||
query="how does my_function work",
|
||||
results=[],
|
||||
symbols=[],
|
||||
stats=SearchStats(dirs_searched=4, files_matched=0, time_ms=11.0),
|
||||
)
|
||||
captured: dict[str, object] = {"fallback_calls": 0}
|
||||
|
||||
monkeypatch.setattr(commands.Config, "load", staticmethod(lambda: Config(data_dir=tmp_path / "data")))
|
||||
monkeypatch.setattr(
|
||||
commands,
|
||||
"RegistryStore",
|
||||
type("FakeRegistryStore", (), {"initialize": lambda self: None, "close": lambda self: None}),
|
||||
)
|
||||
monkeypatch.setattr(commands, "PathMapper", type("FakePathMapper", (), {}))
|
||||
|
||||
class FakeChainSearchEngine:
|
||||
def __init__(self, registry, mapper, config=None):
|
||||
captured["config"] = config
|
||||
|
||||
def search(self, query, source_path, options=None):
|
||||
captured["query"] = query
|
||||
captured["source_path"] = source_path
|
||||
captured["options"] = options
|
||||
return indexed_result
|
||||
|
||||
def cascade_search(self, *_args, **_kwargs):
|
||||
raise AssertionError("hybrid queries should not dispatch to cascade_search")
|
||||
|
||||
def fake_fallback(*_args, **_kwargs):
|
||||
captured["fallback_calls"] = int(captured["fallback_calls"]) + 1
|
||||
return None
|
||||
|
||||
monkeypatch.setattr(commands, "ChainSearchEngine", FakeChainSearchEngine)
|
||||
monkeypatch.setattr(commands, "_filesystem_fallback_search", fake_fallback)
|
||||
|
||||
runner = CliRunner()
|
||||
result = runner.invoke(
|
||||
app,
|
||||
["search", "how does my_function work", "--path", str(workspace), "--json"],
|
||||
)
|
||||
|
||||
assert result.exit_code == 0, result.output
|
||||
body = json.loads(result.stdout)["result"]
|
||||
assert body["method"] == "hybrid"
|
||||
assert body["count"] == 0
|
||||
assert "fallback" not in body
|
||||
assert body["stats"]["files_matched"] == 0
|
||||
assert body["stats"]["time_ms"] == 11.0
|
||||
assert captured["fallback_calls"] == 0
|
||||
assert captured["options"].enable_vector is True
|
||||
assert captured["options"].hybrid_mode is True
|
||||
|
||||
|
||||
def test_filesystem_fallback_search_prefers_source_definitions_for_keyword_queries(
|
||||
monkeypatch,
|
||||
tmp_path,
|
||||
) -> None:
|
||||
workspace = tmp_path / "workspace"
|
||||
workspace.mkdir()
|
||||
|
||||
source_path = workspace / "src" / "registry.py"
|
||||
test_path = workspace / "tests" / "test_registry.py"
|
||||
ref_path = workspace / "src" / "chain_search.py"
|
||||
|
||||
match_lines = [
|
||||
{
|
||||
"type": "match",
|
||||
"data": {
|
||||
"path": {"text": str(test_path)},
|
||||
"lines": {"text": "def test_find_descendant_project_roots_returns_nested_project_roots():\n"},
|
||||
"line_number": 12,
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "match",
|
||||
"data": {
|
||||
"path": {"text": str(source_path)},
|
||||
"lines": {"text": "def find_descendant_project_roots(self, source_root: Path) -> List[DirMapping]:\n"},
|
||||
"line_number": 48,
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "match",
|
||||
"data": {
|
||||
"path": {"text": str(ref_path)},
|
||||
"lines": {"text": "descendant_roots = self.registry.find_descendant_project_roots(source_root)\n"},
|
||||
"line_number": 91,
|
||||
},
|
||||
},
|
||||
]
|
||||
|
||||
monkeypatch.setattr(commands.shutil, "which", lambda _name: "rg")
|
||||
monkeypatch.setattr(
|
||||
commands.subprocess,
|
||||
"run",
|
||||
lambda *_args, **_kwargs: type(
|
||||
"FakeCompletedProcess",
|
||||
(),
|
||||
{
|
||||
"returncode": 0,
|
||||
"stdout": "\n".join(json.dumps(line) for line in match_lines),
|
||||
"stderr": "",
|
||||
},
|
||||
)(),
|
||||
)
|
||||
|
||||
fallback = commands._filesystem_fallback_search(
|
||||
"find_descendant_project_roots",
|
||||
workspace,
|
||||
limit=5,
|
||||
config=Config(data_dir=tmp_path / "data"),
|
||||
)
|
||||
|
||||
assert fallback is not None
|
||||
assert fallback["fallback"]["backend"] == "ripgrep-fallback"
|
||||
assert fallback["results"][0].path == str(source_path)
|
||||
assert fallback["results"][1].path == str(ref_path)
|
||||
assert fallback["results"][2].path == str(test_path)
|
||||
assert fallback["results"][0].score > fallback["results"][1].score > fallback["results"][2].score
|
||||
|
||||
|
||||
def test_clean_json_reports_partial_success_when_locked_files_remain(
|
||||
monkeypatch,
|
||||
tmp_path,
|
||||
) -> None:
|
||||
workspace = tmp_path / "workspace"
|
||||
project_index = tmp_path / "indexes" / "workspace"
|
||||
project_index.mkdir(parents=True)
|
||||
(project_index / "_index.db").write_text("db", encoding="utf-8")
|
||||
locked_path = project_index / "nested" / "_index.db"
|
||||
locked_path.parent.mkdir(parents=True)
|
||||
locked_path.write_text("locked", encoding="utf-8")
|
||||
|
||||
captured: dict[str, object] = {}
|
||||
|
||||
class FakePathMapper:
|
||||
def __init__(self):
|
||||
self.index_root = tmp_path / "indexes"
|
||||
|
||||
def source_to_index_dir(self, source_path):
|
||||
captured["mapped_source"] = source_path
|
||||
return project_index
|
||||
|
||||
class FakeRegistryStore:
|
||||
def initialize(self):
|
||||
captured["registry_initialized"] = True
|
||||
|
||||
def unregister_project(self, source_path):
|
||||
captured["unregistered_project"] = source_path
|
||||
return True
|
||||
|
||||
def close(self):
|
||||
captured["registry_closed"] = True
|
||||
|
||||
def fake_remove_tree(target):
|
||||
captured["removed_target"] = target
|
||||
return {
|
||||
"removed": False,
|
||||
"partial": True,
|
||||
"locked_paths": [str(locked_path)],
|
||||
"remaining_path": str(project_index),
|
||||
"errors": [],
|
||||
}
|
||||
|
||||
monkeypatch.setattr(commands, "PathMapper", FakePathMapper)
|
||||
monkeypatch.setattr(commands, "RegistryStore", FakeRegistryStore)
|
||||
monkeypatch.setattr(commands, "_remove_tree_best_effort", fake_remove_tree)
|
||||
|
||||
runner = CliRunner()
|
||||
result = runner.invoke(app, ["clean", str(workspace), "--json"])
|
||||
|
||||
assert result.exit_code == 0, result.output
|
||||
payload = json.loads(result.stdout)
|
||||
body = payload["result"]
|
||||
assert payload["success"] is True
|
||||
assert body["cleaned"] == str(workspace.resolve())
|
||||
assert body["index_path"] == str(project_index)
|
||||
assert body["partial"] is True
|
||||
assert body["locked_paths"] == [str(locked_path)]
|
||||
assert body["remaining_path"] == str(project_index)
|
||||
assert captured["registry_initialized"] is True
|
||||
assert captured["registry_closed"] is True
|
||||
assert captured["unregistered_project"] == workspace.resolve()
|
||||
assert captured["removed_target"] == project_index
|
||||
@@ -5,7 +5,10 @@ from pathlib import Path
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
from codexlens.config import Config
|
||||
from codexlens.storage.index_tree import IndexTreeBuilder
|
||||
from codexlens.storage.dir_index import DirIndexStore
|
||||
from codexlens.storage.index_tree import DirBuildResult, IndexTreeBuilder
|
||||
from codexlens.storage.path_mapper import PathMapper
|
||||
from codexlens.storage.registry import RegistryStore
|
||||
|
||||
|
||||
def _relative_dirs(source_root: Path, dirs_by_depth: dict[int, list[Path]]) -> set[str]:
|
||||
@@ -145,3 +148,148 @@ def test_builder_loads_saved_ignore_and_extension_filters_by_default(tmp_path: P
|
||||
|
||||
assert [path.name for path in source_files] == ["app.ts"]
|
||||
assert "frontend/dist" not in discovered_dirs
|
||||
|
||||
|
||||
def test_prune_stale_project_dirs_removes_ignored_artifact_mappings(tmp_path: Path) -> None:
|
||||
workspace = tmp_path / "workspace"
|
||||
src_dir = workspace / "src"
|
||||
dist_dir = workspace / "dist"
|
||||
src_dir.mkdir(parents=True)
|
||||
dist_dir.mkdir(parents=True)
|
||||
(src_dir / "app.py").write_text("print('ok')\n", encoding="utf-8")
|
||||
(dist_dir / "bundle.py").write_text("print('artifact')\n", encoding="utf-8")
|
||||
|
||||
mapper = PathMapper(index_root=tmp_path / "indexes")
|
||||
registry = RegistryStore(db_path=tmp_path / "registry.db")
|
||||
registry.initialize()
|
||||
project = registry.register_project(workspace, mapper.source_to_index_dir(workspace))
|
||||
registry.register_dir(project.id, workspace, mapper.source_to_index_db(workspace), depth=0)
|
||||
registry.register_dir(project.id, src_dir, mapper.source_to_index_db(src_dir), depth=1)
|
||||
registry.register_dir(project.id, dist_dir, mapper.source_to_index_db(dist_dir), depth=1)
|
||||
|
||||
builder = IndexTreeBuilder(
|
||||
registry=registry,
|
||||
mapper=mapper,
|
||||
config=Config(data_dir=tmp_path / "data"),
|
||||
incremental=False,
|
||||
)
|
||||
|
||||
dirs_by_depth = builder._collect_dirs_by_depth(workspace)
|
||||
pruned = builder._prune_stale_project_dirs(
|
||||
project_id=project.id,
|
||||
source_root=workspace,
|
||||
dirs_by_depth=dirs_by_depth,
|
||||
)
|
||||
|
||||
remaining = {mapping.source_path.resolve() for mapping in registry.get_project_dirs(project.id)}
|
||||
registry.close()
|
||||
|
||||
assert dist_dir.resolve() in pruned
|
||||
assert workspace.resolve() in remaining
|
||||
assert src_dir.resolve() in remaining
|
||||
assert dist_dir.resolve() not in remaining
|
||||
|
||||
|
||||
def test_force_full_build_prunes_stale_ignored_mappings(tmp_path: Path) -> None:
|
||||
workspace = tmp_path / "workspace"
|
||||
src_dir = workspace / "src"
|
||||
dist_dir = workspace / "dist"
|
||||
src_dir.mkdir(parents=True)
|
||||
dist_dir.mkdir(parents=True)
|
||||
(src_dir / "app.py").write_text("print('ok')\n", encoding="utf-8")
|
||||
(dist_dir / "bundle.py").write_text("print('artifact')\n", encoding="utf-8")
|
||||
|
||||
mapper = PathMapper(index_root=tmp_path / "indexes")
|
||||
registry = RegistryStore(db_path=tmp_path / "registry.db")
|
||||
registry.initialize()
|
||||
project = registry.register_project(workspace, mapper.source_to_index_dir(workspace))
|
||||
registry.register_dir(project.id, workspace, mapper.source_to_index_db(workspace), depth=0)
|
||||
registry.register_dir(project.id, dist_dir, mapper.source_to_index_db(dist_dir), depth=1)
|
||||
|
||||
builder = IndexTreeBuilder(
|
||||
registry=registry,
|
||||
mapper=mapper,
|
||||
config=Config(
|
||||
data_dir=tmp_path / "data",
|
||||
global_symbol_index_enabled=False,
|
||||
),
|
||||
incremental=False,
|
||||
)
|
||||
|
||||
def fake_build_level_parallel(
|
||||
dirs: list[Path],
|
||||
languages,
|
||||
workers,
|
||||
*,
|
||||
source_root: Path,
|
||||
project_id: int,
|
||||
global_index_db_path: Path,
|
||||
) -> list[DirBuildResult]:
|
||||
return [
|
||||
DirBuildResult(
|
||||
source_path=dir_path,
|
||||
index_path=mapper.source_to_index_db(dir_path),
|
||||
files_count=1 if dir_path == src_dir else 0,
|
||||
symbols_count=0,
|
||||
subdirs=[],
|
||||
)
|
||||
for dir_path in dirs
|
||||
]
|
||||
|
||||
builder._build_level_parallel = fake_build_level_parallel # type: ignore[method-assign]
|
||||
builder._link_children_to_parent = MagicMock()
|
||||
|
||||
build_result = builder.build(workspace, force_full=True, workers=1)
|
||||
|
||||
remaining = {mapping.source_path.resolve() for mapping in registry.get_project_dirs(project.id)}
|
||||
registry.close()
|
||||
|
||||
assert build_result.total_dirs == 2
|
||||
assert workspace.resolve() in remaining
|
||||
assert src_dir.resolve() in remaining
|
||||
assert dist_dir.resolve() not in remaining
|
||||
|
||||
|
||||
def test_force_full_build_rewrites_directory_db_and_drops_stale_ignored_subdirs(
|
||||
tmp_path: Path,
|
||||
) -> None:
|
||||
project_root = tmp_path / "project"
|
||||
src_dir = project_root / "src"
|
||||
build_dir = project_root / "build"
|
||||
src_dir.mkdir(parents=True)
|
||||
build_dir.mkdir(parents=True)
|
||||
(src_dir / "app.py").write_text("print('ok')\n", encoding="utf-8")
|
||||
(build_dir / "generated.py").write_text("print('artifact')\n", encoding="utf-8")
|
||||
|
||||
mapper = PathMapper(index_root=tmp_path / "indexes")
|
||||
registry = RegistryStore(db_path=tmp_path / "registry.db")
|
||||
registry.initialize()
|
||||
config = Config(
|
||||
data_dir=tmp_path / "data",
|
||||
global_symbol_index_enabled=False,
|
||||
)
|
||||
|
||||
root_index_db = mapper.source_to_index_db(project_root)
|
||||
with DirIndexStore(root_index_db, config=config) as store:
|
||||
store.register_subdir(
|
||||
name="build",
|
||||
index_path=mapper.source_to_index_db(build_dir),
|
||||
files_count=1,
|
||||
)
|
||||
|
||||
builder = IndexTreeBuilder(
|
||||
registry=registry,
|
||||
mapper=mapper,
|
||||
config=config,
|
||||
incremental=False,
|
||||
)
|
||||
|
||||
build_result = builder.build(project_root, force_full=True, workers=1)
|
||||
|
||||
with DirIndexStore(root_index_db, config=config) as store:
|
||||
subdir_names = [link.name for link in store.get_subdirs()]
|
||||
|
||||
registry.close()
|
||||
|
||||
assert build_result.total_dirs == 2
|
||||
assert subdir_names == ["src"]
|
||||
|
||||
@@ -24,13 +24,24 @@ from codexlens.entities import SearchResult
|
||||
from codexlens.search.ranking import (
|
||||
DEFAULT_WEIGHTS,
|
||||
QueryIntent,
|
||||
apply_path_penalties,
|
||||
extract_explicit_path_hints,
|
||||
cross_encoder_rerank,
|
||||
adjust_weights_by_intent,
|
||||
apply_symbol_boost,
|
||||
detect_query_intent,
|
||||
filter_results_by_category,
|
||||
get_rrf_weights,
|
||||
group_similar_results,
|
||||
is_auxiliary_reference_path,
|
||||
is_generated_artifact_path,
|
||||
is_test_file,
|
||||
normalize_weights,
|
||||
query_prefers_lexical_search,
|
||||
query_targets_auxiliary_files,
|
||||
query_targets_generated_files,
|
||||
query_targets_test_files,
|
||||
rebalance_noisy_results,
|
||||
reciprocal_rank_fusion,
|
||||
simple_weighted_fusion,
|
||||
)
|
||||
@@ -73,6 +84,7 @@ class TestDetectQueryIntent:
|
||||
def test_detect_keyword_intent(self):
|
||||
"""CamelCase/underscore queries should be detected as KEYWORD."""
|
||||
assert detect_query_intent("MyClassName") == QueryIntent.KEYWORD
|
||||
assert detect_query_intent("windowsHide") == QueryIntent.KEYWORD
|
||||
assert detect_query_intent("my_function_name") == QueryIntent.KEYWORD
|
||||
assert detect_query_intent("foo::bar") == QueryIntent.KEYWORD
|
||||
|
||||
@@ -91,6 +103,25 @@ class TestDetectQueryIntent:
|
||||
assert detect_query_intent("") == QueryIntent.MIXED
|
||||
assert detect_query_intent(" ") == QueryIntent.MIXED
|
||||
|
||||
def test_query_targets_test_files(self):
|
||||
"""Queries explicitly mentioning tests should skip test penalties."""
|
||||
assert query_targets_test_files("how do tests cover auth flow?")
|
||||
assert query_targets_test_files("spec fixtures for parser")
|
||||
assert not query_targets_test_files("windowsHide")
|
||||
|
||||
def test_query_targets_generated_files(self):
|
||||
"""Queries explicitly mentioning build artifacts should skip that penalty."""
|
||||
assert query_targets_generated_files("inspect dist bundle output")
|
||||
assert query_targets_generated_files("generated artifacts under build")
|
||||
assert not query_targets_generated_files("cache invalidation strategy")
|
||||
|
||||
def test_query_prefers_lexical_search(self):
|
||||
"""Config/env/factory queries should prefer lexical-first routing."""
|
||||
assert query_prefers_lexical_search("embedding backend fastembed local litellm api config")
|
||||
assert query_prefers_lexical_search("get_reranker factory onnx backend selection")
|
||||
assert query_prefers_lexical_search("EMBEDDING_BACKEND and RERANKER_BACKEND environment variables")
|
||||
assert not query_prefers_lexical_search("how does smart search route keyword queries")
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Tests: adjust_weights_by_intent
|
||||
@@ -129,6 +160,427 @@ class TestAdjustWeightsByIntent:
|
||||
assert adjusted["exact"] == pytest.approx(0.3, abs=0.01)
|
||||
|
||||
|
||||
class TestPathPenalties:
|
||||
"""Tests for lightweight path-based ranking penalties."""
|
||||
|
||||
def test_is_test_file(self):
|
||||
assert is_test_file("/repo/tests/test_auth.py")
|
||||
assert is_test_file("D:\\repo\\src\\auth.spec.ts")
|
||||
assert is_test_file("/repo/frontend/src/pages/discoverypage.test.tsx")
|
||||
assert is_test_file("/repo/frontend/src/pages/discoverypage.spec.jsx")
|
||||
assert not is_test_file("/repo/src/auth.py")
|
||||
|
||||
def test_is_generated_artifact_path(self):
|
||||
assert is_generated_artifact_path("/repo/dist/app.js")
|
||||
assert is_generated_artifact_path("/repo/src/generated/client.ts")
|
||||
assert is_generated_artifact_path("D:\\repo\\frontend\\.next\\server.js")
|
||||
assert not is_generated_artifact_path("/repo/src/auth.py")
|
||||
|
||||
def test_is_auxiliary_reference_path(self):
|
||||
assert is_auxiliary_reference_path("/repo/examples/auth_demo.py")
|
||||
assert is_auxiliary_reference_path("/repo/benchmarks/search_eval.py")
|
||||
assert is_auxiliary_reference_path("/repo/tools/debug_search.py")
|
||||
assert not is_auxiliary_reference_path("/repo/src/auth.py")
|
||||
|
||||
def test_query_targets_auxiliary_files(self):
|
||||
assert query_targets_auxiliary_files("show smart search examples")
|
||||
assert query_targets_auxiliary_files("benchmark smart search")
|
||||
assert not query_targets_auxiliary_files("smart search routing")
|
||||
|
||||
def test_apply_path_penalties_demotes_test_files(self):
|
||||
results = [
|
||||
_make_result(path="/repo/tests/test_auth.py", score=10.0),
|
||||
_make_result(path="/repo/src/auth.py", score=9.0),
|
||||
]
|
||||
|
||||
penalized = apply_path_penalties(
|
||||
results,
|
||||
"authenticate user",
|
||||
test_file_penalty=0.15,
|
||||
)
|
||||
|
||||
assert penalized[0].path == "/repo/src/auth.py"
|
||||
assert penalized[1].metadata["path_penalty_reasons"] == ["test_file"]
|
||||
|
||||
def test_apply_path_penalties_more_aggressively_demotes_tests_for_keyword_queries(self):
|
||||
results = [
|
||||
_make_result(path="/repo/tests/test_auth.py", score=5.0),
|
||||
_make_result(path="/repo/src/auth.py", score=4.0),
|
||||
]
|
||||
|
||||
penalized = apply_path_penalties(
|
||||
results,
|
||||
"find_descendant_project_roots",
|
||||
test_file_penalty=0.15,
|
||||
)
|
||||
|
||||
assert penalized[0].path == "/repo/src/auth.py"
|
||||
assert penalized[1].metadata["path_penalty_reasons"] == ["test_file"]
|
||||
assert penalized[1].metadata["path_penalty_multiplier"] == pytest.approx(0.55)
|
||||
assert penalized[1].metadata["path_rank_multiplier"] == pytest.approx(0.55)
|
||||
|
||||
def test_apply_path_penalties_more_aggressively_demotes_tests_for_semantic_queries(self):
|
||||
results = [
|
||||
_make_result(path="/repo/tests/test_auth.py", score=5.0),
|
||||
_make_result(path="/repo/src/auth.py", score=4.1),
|
||||
]
|
||||
|
||||
penalized = apply_path_penalties(
|
||||
results,
|
||||
"how does auth routing work",
|
||||
test_file_penalty=0.15,
|
||||
)
|
||||
|
||||
assert penalized[0].path == "/repo/src/auth.py"
|
||||
assert penalized[1].metadata["path_penalty_reasons"] == ["test_file"]
|
||||
assert penalized[1].metadata["path_penalty_multiplier"] == pytest.approx(0.75)
|
||||
|
||||
def test_apply_path_penalties_boosts_source_definitions_for_identifier_queries(self):
|
||||
results = [
|
||||
_make_result(
|
||||
path="/repo/tests/test_registry.py",
|
||||
score=4.2,
|
||||
excerpt='query="find_descendant_project_roots"',
|
||||
),
|
||||
_make_result(
|
||||
path="/repo/src/registry.py",
|
||||
score=3.0,
|
||||
excerpt="def find_descendant_project_roots(self, source_root: Path) -> list[str]:",
|
||||
),
|
||||
]
|
||||
|
||||
penalized = apply_path_penalties(
|
||||
results,
|
||||
"find_descendant_project_roots",
|
||||
test_file_penalty=0.15,
|
||||
)
|
||||
|
||||
assert penalized[0].path == "/repo/src/registry.py"
|
||||
assert penalized[0].metadata["path_boost_reasons"] == ["source_definition"]
|
||||
assert penalized[0].metadata["path_boost_multiplier"] == pytest.approx(2.0)
|
||||
assert penalized[0].metadata["path_rank_multiplier"] == pytest.approx(2.0)
|
||||
assert penalized[1].metadata["path_penalty_reasons"] == ["test_file"]
|
||||
|
||||
def test_apply_path_penalties_boosts_source_paths_for_semantic_feature_queries(self):
|
||||
results = [
|
||||
_make_result(
|
||||
path="/repo/tests/smart-search-intent.test.js",
|
||||
score=0.832,
|
||||
excerpt="describes how smart search routes keyword queries",
|
||||
),
|
||||
_make_result(
|
||||
path="/repo/src/tools/smart-search.ts",
|
||||
score=0.555,
|
||||
excerpt="smart search keyword routing logic",
|
||||
),
|
||||
]
|
||||
|
||||
penalized = apply_path_penalties(
|
||||
results,
|
||||
"how does smart search route keyword queries",
|
||||
test_file_penalty=0.15,
|
||||
)
|
||||
|
||||
assert penalized[0].path == "/repo/src/tools/smart-search.ts"
|
||||
assert penalized[0].metadata["path_boost_reasons"] == ["source_path_topic_overlap"]
|
||||
assert penalized[0].metadata["path_boost_multiplier"] == pytest.approx(1.35)
|
||||
assert penalized[0].metadata["path_boost_overlap_tokens"] == ["smart", "search"]
|
||||
assert penalized[1].metadata["path_penalty_reasons"] == ["test_file"]
|
||||
|
||||
def test_apply_path_penalties_strongly_boosts_keyword_basename_overlap(self):
|
||||
results = [
|
||||
_make_result(
|
||||
path="/repo/src/tools/core-memory.ts",
|
||||
score=0.04032417772512223,
|
||||
excerpt="memory listing helpers",
|
||||
),
|
||||
_make_result(
|
||||
path="/repo/src/tools/smart-search.ts",
|
||||
score=0.009836065573770493,
|
||||
excerpt="smart search keyword routing logic",
|
||||
),
|
||||
]
|
||||
|
||||
penalized = apply_path_penalties(
|
||||
results,
|
||||
"executeHybridMode dense_rerank semantic smart_search",
|
||||
test_file_penalty=0.15,
|
||||
)
|
||||
|
||||
assert penalized[0].path == "/repo/src/tools/smart-search.ts"
|
||||
assert penalized[0].metadata["path_boost_reasons"] == ["source_path_topic_overlap"]
|
||||
assert penalized[0].metadata["path_boost_multiplier"] == pytest.approx(4.5)
|
||||
assert penalized[0].metadata["path_boost_overlap_tokens"] == ["smart", "search"]
|
||||
|
||||
def test_extract_explicit_path_hints_ignores_generic_platform_terms(self):
|
||||
assert extract_explicit_path_hints(
|
||||
"parse CodexLens JSON output strip ANSI smart_search",
|
||||
) == [["smart", "search"]]
|
||||
|
||||
def test_apply_path_penalties_prefers_explicit_feature_hint_over_platform_terms(self):
|
||||
results = [
|
||||
_make_result(
|
||||
path="/repo/src/tools/codex-lens-lsp.ts",
|
||||
score=0.045,
|
||||
excerpt="CodexLens LSP bridge",
|
||||
),
|
||||
_make_result(
|
||||
path="/repo/src/tools/smart-search.ts",
|
||||
score=0.03,
|
||||
excerpt="parse JSON output and strip ANSI for plain-text fallback",
|
||||
),
|
||||
]
|
||||
|
||||
penalized = apply_path_penalties(
|
||||
results,
|
||||
"parse CodexLens JSON output strip ANSI smart_search",
|
||||
test_file_penalty=0.15,
|
||||
)
|
||||
|
||||
assert penalized[0].path == "/repo/src/tools/smart-search.ts"
|
||||
assert penalized[0].metadata["path_boost_reasons"] == ["source_path_topic_overlap"]
|
||||
assert penalized[0].metadata["path_boost_overlap_tokens"] == ["smart", "search"]
|
||||
|
||||
def test_apply_path_penalties_strongly_boosts_lexical_config_modules(self):
|
||||
results = [
|
||||
_make_result(
|
||||
path="/repo/src/tools/smart-search.ts",
|
||||
score=22.07,
|
||||
excerpt="embedding backend local api config routing",
|
||||
),
|
||||
_make_result(
|
||||
path="/repo/src/codexlens/config.py",
|
||||
score=4.88,
|
||||
excerpt="embedding_backend = 'fastembed'",
|
||||
),
|
||||
]
|
||||
|
||||
penalized = apply_path_penalties(
|
||||
results,
|
||||
"embedding backend fastembed local litellm api config",
|
||||
test_file_penalty=0.15,
|
||||
)
|
||||
|
||||
assert penalized[0].path == "/repo/src/codexlens/config.py"
|
||||
assert penalized[0].metadata["path_boost_reasons"] == ["source_path_topic_overlap"]
|
||||
assert penalized[0].metadata["path_boost_multiplier"] == pytest.approx(5.0)
|
||||
assert penalized[0].metadata["path_boost_overlap_tokens"] == ["config"]
|
||||
|
||||
def test_apply_path_penalties_more_aggressively_demotes_tests_for_explicit_feature_queries(self):
|
||||
results = [
|
||||
_make_result(
|
||||
path="/repo/tests/smart-search-intent.test.js",
|
||||
score=1.0,
|
||||
excerpt="smart search intent coverage",
|
||||
),
|
||||
_make_result(
|
||||
path="/repo/src/tools/smart-search.ts",
|
||||
score=0.58,
|
||||
excerpt="plain-text JSON fallback for smart search",
|
||||
),
|
||||
]
|
||||
|
||||
penalized = apply_path_penalties(
|
||||
results,
|
||||
"parse CodexLens JSON output strip ANSI smart_search",
|
||||
test_file_penalty=0.15,
|
||||
)
|
||||
|
||||
assert penalized[0].path == "/repo/src/tools/smart-search.ts"
|
||||
assert penalized[1].metadata["path_penalty_reasons"] == ["test_file"]
|
||||
assert penalized[1].metadata["path_penalty_multiplier"] == pytest.approx(0.55)
|
||||
|
||||
def test_apply_path_penalties_demotes_generated_artifacts(self):
|
||||
results = [
|
||||
_make_result(path="/repo/dist/auth.js", score=10.0),
|
||||
_make_result(path="/repo/src/auth.ts", score=9.0),
|
||||
]
|
||||
|
||||
penalized = apply_path_penalties(
|
||||
results,
|
||||
"authenticate user",
|
||||
generated_file_penalty=0.35,
|
||||
)
|
||||
|
||||
assert penalized[0].path == "/repo/src/auth.ts"
|
||||
assert penalized[1].metadata["path_penalty_reasons"] == ["generated_artifact"]
|
||||
|
||||
def test_apply_path_penalties_more_aggressively_demotes_generated_artifacts_for_explicit_feature_queries(self):
|
||||
results = [
|
||||
_make_result(
|
||||
path="/repo/dist/tools/smart-search.js",
|
||||
score=1.0,
|
||||
excerpt="built smart search output",
|
||||
),
|
||||
_make_result(
|
||||
path="/repo/src/tools/smart-search.ts",
|
||||
score=0.45,
|
||||
excerpt="plain-text JSON fallback for smart search",
|
||||
),
|
||||
]
|
||||
|
||||
penalized = apply_path_penalties(
|
||||
results,
|
||||
"parse CodexLens JSON output strip ANSI smart_search",
|
||||
generated_file_penalty=0.35,
|
||||
)
|
||||
|
||||
assert penalized[0].path == "/repo/src/tools/smart-search.ts"
|
||||
assert penalized[1].metadata["path_penalty_reasons"] == ["generated_artifact"]
|
||||
assert penalized[1].metadata["path_penalty_multiplier"] == pytest.approx(0.4)
|
||||
|
||||
def test_apply_path_penalties_demotes_auxiliary_reference_files(self):
|
||||
results = [
|
||||
_make_result(path="/repo/examples/simple_search_comparison.py", score=10.0),
|
||||
_make_result(path="/repo/src/search/router.py", score=9.0),
|
||||
]
|
||||
|
||||
penalized = apply_path_penalties(
|
||||
results,
|
||||
"how does smart search route keyword queries",
|
||||
test_file_penalty=0.15,
|
||||
)
|
||||
|
||||
assert penalized[0].path == "/repo/src/search/router.py"
|
||||
assert penalized[1].metadata["path_penalty_reasons"] == ["auxiliary_file"]
|
||||
|
||||
def test_apply_path_penalties_more_aggressively_demotes_auxiliary_files_for_explicit_feature_queries(self):
|
||||
results = [
|
||||
_make_result(
|
||||
path="/repo/benchmarks/smart_search_demo.py",
|
||||
score=1.0,
|
||||
excerpt="demo for smart search fallback",
|
||||
),
|
||||
_make_result(
|
||||
path="/repo/src/tools/smart-search.ts",
|
||||
score=0.52,
|
||||
excerpt="plain-text JSON fallback for smart search",
|
||||
),
|
||||
]
|
||||
|
||||
penalized = apply_path_penalties(
|
||||
results,
|
||||
"parse CodexLens JSON output strip ANSI smart_search",
|
||||
test_file_penalty=0.15,
|
||||
)
|
||||
|
||||
assert penalized[0].path == "/repo/src/tools/smart-search.ts"
|
||||
assert penalized[1].metadata["path_penalty_reasons"] == ["auxiliary_file"]
|
||||
assert penalized[1].metadata["path_penalty_multiplier"] == pytest.approx(0.5)
|
||||
|
||||
def test_apply_path_penalties_skips_when_query_targets_tests(self):
|
||||
results = [
|
||||
_make_result(path="/repo/tests/test_auth.py", score=10.0),
|
||||
_make_result(path="/repo/src/auth.py", score=9.0),
|
||||
]
|
||||
|
||||
penalized = apply_path_penalties(
|
||||
results,
|
||||
"auth tests",
|
||||
test_file_penalty=0.15,
|
||||
)
|
||||
|
||||
assert penalized[0].path == "/repo/tests/test_auth.py"
|
||||
|
||||
def test_apply_path_penalties_skips_generated_penalty_when_query_targets_artifacts(self):
|
||||
results = [
|
||||
_make_result(path="/repo/dist/auth.js", score=10.0),
|
||||
_make_result(path="/repo/src/auth.ts", score=9.0),
|
||||
]
|
||||
|
||||
penalized = apply_path_penalties(
|
||||
results,
|
||||
"dist auth bundle",
|
||||
generated_file_penalty=0.35,
|
||||
)
|
||||
|
||||
assert penalized[0].path == "/repo/dist/auth.js"
|
||||
|
||||
def test_rebalance_noisy_results_pushes_explicit_feature_query_noise_behind_source_files(self):
|
||||
results = [
|
||||
_make_result(path="/repo/src/tools/smart-search.ts", score=0.9),
|
||||
_make_result(path="/repo/tests/smart-search-intent.test.tsx", score=0.8),
|
||||
_make_result(path="/repo/src/core/cli-routes.ts", score=0.7),
|
||||
_make_result(path="/repo/dist/tools/smart-search.js", score=0.6),
|
||||
_make_result(path="/repo/benchmarks/smart_search_demo.py", score=0.5),
|
||||
]
|
||||
|
||||
rebalanced = rebalance_noisy_results(
|
||||
results,
|
||||
"parse CodexLens JSON output strip ANSI smart_search",
|
||||
)
|
||||
|
||||
assert [item.path for item in rebalanced[:2]] == [
|
||||
"/repo/src/tools/smart-search.ts",
|
||||
"/repo/src/core/cli-routes.ts",
|
||||
]
|
||||
|
||||
def test_rebalance_noisy_results_preserves_tests_when_query_targets_them(self):
|
||||
results = [
|
||||
_make_result(path="/repo/tests/smart-search-intent.test.tsx", score=0.9),
|
||||
_make_result(path="/repo/src/tools/smart-search.ts", score=0.8),
|
||||
]
|
||||
|
||||
rebalanced = rebalance_noisy_results(results, "smart search tests")
|
||||
|
||||
assert [item.path for item in rebalanced] == [
|
||||
"/repo/tests/smart-search-intent.test.tsx",
|
||||
"/repo/src/tools/smart-search.ts",
|
||||
]
|
||||
|
||||
def test_apply_path_penalties_skips_auxiliary_penalty_when_query_targets_examples(self):
|
||||
results = [
|
||||
_make_result(path="/repo/examples/simple_search_comparison.py", score=10.0),
|
||||
_make_result(path="/repo/src/search/router.py", score=9.0),
|
||||
]
|
||||
|
||||
penalized = apply_path_penalties(
|
||||
results,
|
||||
"smart search examples",
|
||||
test_file_penalty=0.15,
|
||||
)
|
||||
|
||||
assert penalized[0].path == "/repo/examples/simple_search_comparison.py"
|
||||
|
||||
|
||||
class TestCrossEncoderRerank:
|
||||
"""Tests for cross-encoder reranking edge cases."""
|
||||
|
||||
def test_cross_encoder_rerank_preserves_strong_source_candidates_for_semantic_feature_queries(self):
|
||||
class DummyReranker:
|
||||
def score_pairs(self, pairs, batch_size=32):
|
||||
_ = (pairs, batch_size)
|
||||
return [0.8323705792427063, 1.2463066923373844e-05]
|
||||
|
||||
reranked = cross_encoder_rerank(
|
||||
"how does smart search route keyword queries",
|
||||
[
|
||||
_make_result(
|
||||
path="/repo/tests/smart-search-intent.test.js",
|
||||
score=0.5989155769348145,
|
||||
excerpt="describes how smart search routes keyword queries",
|
||||
),
|
||||
_make_result(
|
||||
path="/repo/src/tools/smart-search.ts",
|
||||
score=0.554444432258606,
|
||||
excerpt="smart search keyword routing logic",
|
||||
),
|
||||
],
|
||||
DummyReranker(),
|
||||
top_k=2,
|
||||
)
|
||||
reranked = apply_path_penalties(
|
||||
reranked,
|
||||
"how does smart search route keyword queries",
|
||||
test_file_penalty=0.15,
|
||||
)
|
||||
|
||||
assert reranked[0].path == "/repo/src/tools/smart-search.ts"
|
||||
assert reranked[0].metadata["cross_encoder_floor_reason"] == "semantic_source_path_overlap"
|
||||
assert reranked[0].metadata["cross_encoder_floor_overlap_tokens"] == ["smart", "search"]
|
||||
assert reranked[0].metadata["path_boost_reasons"] == ["source_path_topic_overlap"]
|
||||
assert reranked[1].metadata["path_penalty_reasons"] == ["test_file"]
|
||||
|
||||
# =============================================================================
|
||||
# Tests: get_rrf_weights
|
||||
# =============================================================================
|
||||
|
||||
@@ -67,3 +67,60 @@ def test_find_nearest_index(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) ->
|
||||
assert found is not None
|
||||
assert found.id == mapping.id
|
||||
|
||||
|
||||
def test_find_descendant_project_roots_returns_nested_project_roots(tmp_path: Path) -> None:
|
||||
db_path = tmp_path / "registry.db"
|
||||
workspace_root = tmp_path / "workspace"
|
||||
child_a = workspace_root / "packages" / "app-a"
|
||||
child_b = workspace_root / "tools" / "app-b"
|
||||
outside_root = tmp_path / "external"
|
||||
|
||||
with RegistryStore(db_path=db_path) as store:
|
||||
workspace_project = store.register_project(
|
||||
workspace_root,
|
||||
tmp_path / "indexes" / "workspace",
|
||||
)
|
||||
child_a_project = store.register_project(
|
||||
child_a,
|
||||
tmp_path / "indexes" / "workspace" / "packages" / "app-a",
|
||||
)
|
||||
child_b_project = store.register_project(
|
||||
child_b,
|
||||
tmp_path / "indexes" / "workspace" / "tools" / "app-b",
|
||||
)
|
||||
outside_project = store.register_project(
|
||||
outside_root,
|
||||
tmp_path / "indexes" / "external",
|
||||
)
|
||||
|
||||
store.register_dir(
|
||||
workspace_project.id,
|
||||
workspace_root,
|
||||
tmp_path / "indexes" / "workspace" / "_index.db",
|
||||
depth=0,
|
||||
)
|
||||
child_a_mapping = store.register_dir(
|
||||
child_a_project.id,
|
||||
child_a,
|
||||
tmp_path / "indexes" / "workspace" / "packages" / "app-a" / "_index.db",
|
||||
depth=0,
|
||||
)
|
||||
child_b_mapping = store.register_dir(
|
||||
child_b_project.id,
|
||||
child_b,
|
||||
tmp_path / "indexes" / "workspace" / "tools" / "app-b" / "_index.db",
|
||||
depth=0,
|
||||
)
|
||||
store.register_dir(
|
||||
outside_project.id,
|
||||
outside_root,
|
||||
tmp_path / "indexes" / "external" / "_index.db",
|
||||
depth=0,
|
||||
)
|
||||
|
||||
descendants = store.find_descendant_project_roots(workspace_root)
|
||||
|
||||
assert [mapping.index_path for mapping in descendants] == [
|
||||
child_a_mapping.index_path,
|
||||
child_b_mapping.index_path,
|
||||
]
|
||||
|
||||
@@ -313,3 +313,89 @@ def test_onnx_reranker_scores_pairs_with_sigmoid_normalization(
|
||||
|
||||
expected = [1.0 / (1.0 + math.exp(-float(i))) for i in range(len(pairs))]
|
||||
assert scores == pytest.approx(expected, rel=1e-6, abs=1e-6)
|
||||
|
||||
|
||||
def test_onnx_reranker_splits_tuple_providers_into_provider_options(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
) -> None:
|
||||
import numpy as np
|
||||
|
||||
captured: dict[str, object] = {}
|
||||
|
||||
dummy_onnxruntime = types.ModuleType("onnxruntime")
|
||||
|
||||
dummy_optimum = types.ModuleType("optimum")
|
||||
dummy_optimum.__path__ = []
|
||||
dummy_optimum_ort = types.ModuleType("optimum.onnxruntime")
|
||||
|
||||
class DummyModelOutput:
|
||||
def __init__(self, logits: np.ndarray) -> None:
|
||||
self.logits = logits
|
||||
|
||||
class DummyModel:
|
||||
input_names = ["input_ids", "attention_mask"]
|
||||
|
||||
def __call__(self, **inputs):
|
||||
batch = int(inputs["input_ids"].shape[0])
|
||||
return DummyModelOutput(logits=np.zeros((batch, 1), dtype=np.float32))
|
||||
|
||||
class DummyORTModelForSequenceClassification:
|
||||
@classmethod
|
||||
def from_pretrained(
|
||||
cls,
|
||||
model_name: str,
|
||||
providers=None,
|
||||
provider_options=None,
|
||||
**kwargs,
|
||||
):
|
||||
captured["model_name"] = model_name
|
||||
captured["providers"] = providers
|
||||
captured["provider_options"] = provider_options
|
||||
captured["kwargs"] = kwargs
|
||||
return DummyModel()
|
||||
|
||||
dummy_optimum_ort.ORTModelForSequenceClassification = DummyORTModelForSequenceClassification
|
||||
|
||||
dummy_transformers = types.ModuleType("transformers")
|
||||
|
||||
class DummyAutoTokenizer:
|
||||
model_max_length = 512
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, model_name: str, **kwargs):
|
||||
_ = model_name, kwargs
|
||||
return cls()
|
||||
|
||||
def __call__(self, *, text, text_pair, return_tensors, **kwargs):
|
||||
_ = text_pair, kwargs
|
||||
assert return_tensors == "np"
|
||||
batch = len(text)
|
||||
return {
|
||||
"input_ids": np.zeros((batch, 4), dtype=np.int64),
|
||||
"attention_mask": np.ones((batch, 4), dtype=np.int64),
|
||||
}
|
||||
|
||||
dummy_transformers.AutoTokenizer = DummyAutoTokenizer
|
||||
|
||||
monkeypatch.setitem(sys.modules, "onnxruntime", dummy_onnxruntime)
|
||||
monkeypatch.setitem(sys.modules, "optimum", dummy_optimum)
|
||||
monkeypatch.setitem(sys.modules, "optimum.onnxruntime", dummy_optimum_ort)
|
||||
monkeypatch.setitem(sys.modules, "transformers", dummy_transformers)
|
||||
|
||||
reranker = get_reranker(
|
||||
backend="onnx",
|
||||
model_name="dummy-model",
|
||||
use_gpu=True,
|
||||
providers=[
|
||||
("DmlExecutionProvider", {"device_id": 1}),
|
||||
"CPUExecutionProvider",
|
||||
],
|
||||
)
|
||||
assert isinstance(reranker, ONNXReranker)
|
||||
|
||||
scores = reranker.score_pairs([("q", "d")], batch_size=1)
|
||||
|
||||
assert scores == pytest.approx([0.5])
|
||||
assert captured["model_name"] == "dummy-model"
|
||||
assert captured["providers"] == ["DmlExecutionProvider", "CPUExecutionProvider"]
|
||||
assert captured["provider_options"] == [{"device_id": 1}, {}]
|
||||
|
||||
@@ -428,6 +428,51 @@ class TestIndexPathCollection:
|
||||
assert len(paths) == 1
|
||||
engine.close()
|
||||
|
||||
def test_collect_skips_ignored_artifact_indexes(self, mock_registry, mock_mapper, temp_dir):
|
||||
"""Test collection skips dist/build-style artifact subtrees."""
|
||||
root_dir = temp_dir / "project"
|
||||
root_dir.mkdir()
|
||||
|
||||
root_db = root_dir / "_index.db"
|
||||
root_store = DirIndexStore(root_db)
|
||||
root_store.initialize()
|
||||
|
||||
src_dir = root_dir / "src"
|
||||
src_dir.mkdir()
|
||||
src_db = src_dir / "_index.db"
|
||||
src_store = DirIndexStore(src_db)
|
||||
src_store.initialize()
|
||||
|
||||
dist_dir = root_dir / "dist"
|
||||
dist_dir.mkdir()
|
||||
dist_db = dist_dir / "_index.db"
|
||||
dist_store = DirIndexStore(dist_db)
|
||||
dist_store.initialize()
|
||||
|
||||
workflow_dir = root_dir / ".workflow"
|
||||
workflow_dir.mkdir()
|
||||
workflow_db = workflow_dir / "_index.db"
|
||||
workflow_store = DirIndexStore(workflow_db)
|
||||
workflow_store.initialize()
|
||||
|
||||
root_store.register_subdir(name="src", index_path=src_db)
|
||||
root_store.register_subdir(name="dist", index_path=dist_db)
|
||||
root_store.register_subdir(name=".workflow", index_path=workflow_db)
|
||||
|
||||
root_store.close()
|
||||
src_store.close()
|
||||
dist_store.close()
|
||||
workflow_store.close()
|
||||
|
||||
engine = ChainSearchEngine(mock_registry, mock_mapper)
|
||||
paths = engine._collect_index_paths(root_db, depth=-1)
|
||||
|
||||
assert {path.relative_to(root_dir).as_posix() for path in paths} == {
|
||||
"_index.db",
|
||||
"src/_index.db",
|
||||
}
|
||||
engine.close()
|
||||
|
||||
|
||||
class TestResultMergeAndRank:
|
||||
"""Tests for _merge_and_rank method."""
|
||||
@@ -490,6 +535,36 @@ class TestResultMergeAndRank:
|
||||
assert merged == []
|
||||
engine.close()
|
||||
|
||||
def test_merge_applies_test_file_penalty_for_non_test_query(self, mock_registry, mock_mapper):
|
||||
"""Non-test queries should lightly demote test files during merge."""
|
||||
engine = ChainSearchEngine(mock_registry, mock_mapper)
|
||||
|
||||
results = [
|
||||
SearchResult(path="/repo/tests/test_auth.py", score=10.0, excerpt="match 1"),
|
||||
SearchResult(path="/repo/src/auth.py", score=9.0, excerpt="match 2"),
|
||||
]
|
||||
|
||||
merged = engine._merge_and_rank(results, limit=10, query="authenticate users")
|
||||
|
||||
assert merged[0].path == "/repo/src/auth.py"
|
||||
assert merged[1].metadata["path_penalty_reasons"] == ["test_file"]
|
||||
engine.close()
|
||||
|
||||
def test_merge_applies_generated_file_penalty_for_non_artifact_query(self, mock_registry, mock_mapper):
|
||||
"""Non-artifact queries should lightly demote generated/build results during merge."""
|
||||
engine = ChainSearchEngine(mock_registry, mock_mapper)
|
||||
|
||||
results = [
|
||||
SearchResult(path="/repo/dist/auth.js", score=10.0, excerpt="match 1"),
|
||||
SearchResult(path="/repo/src/auth.ts", score=9.0, excerpt="match 2"),
|
||||
]
|
||||
|
||||
merged = engine._merge_and_rank(results, limit=10, query="authenticate users")
|
||||
|
||||
assert merged[0].path == "/repo/src/auth.ts"
|
||||
assert merged[1].metadata["path_penalty_reasons"] == ["generated_artifact"]
|
||||
engine.close()
|
||||
|
||||
|
||||
# === Hierarchical Chain Search Tests ===
|
||||
|
||||
|
||||
@@ -400,15 +400,20 @@ class TestStage4OptionalRerank:
|
||||
"""Tests for Stage 4: Optional cross-encoder reranking."""
|
||||
|
||||
def test_stage4_reranks_with_reranker(
|
||||
self, mock_registry, mock_mapper, mock_config
|
||||
self, mock_registry, mock_mapper, temp_paths
|
||||
):
|
||||
"""Test _stage4_optional_rerank uses _cross_encoder_rerank."""
|
||||
engine = ChainSearchEngine(mock_registry, mock_mapper, config=mock_config)
|
||||
"""Test _stage4_optional_rerank overfetches before final trim."""
|
||||
config = Config(data_dir=temp_paths / "data")
|
||||
config.reranker_top_k = 4
|
||||
config.reranking_top_k = 4
|
||||
engine = ChainSearchEngine(mock_registry, mock_mapper, config=config)
|
||||
|
||||
results = [
|
||||
SearchResult(path="a.py", score=0.9, excerpt="a"),
|
||||
SearchResult(path="b.py", score=0.8, excerpt="b"),
|
||||
SearchResult(path="c.py", score=0.7, excerpt="c"),
|
||||
SearchResult(path="d.py", score=0.6, excerpt="d"),
|
||||
SearchResult(path="e.py", score=0.5, excerpt="e"),
|
||||
]
|
||||
|
||||
# Mock the _cross_encoder_rerank method that _stage4 calls
|
||||
@@ -416,12 +421,14 @@ class TestStage4OptionalRerank:
|
||||
mock_rerank.return_value = [
|
||||
SearchResult(path="c.py", score=0.95, excerpt="c"),
|
||||
SearchResult(path="a.py", score=0.85, excerpt="a"),
|
||||
SearchResult(path="d.py", score=0.83, excerpt="d"),
|
||||
SearchResult(path="e.py", score=0.81, excerpt="e"),
|
||||
]
|
||||
|
||||
reranked = engine._stage4_optional_rerank("query", results, k=2)
|
||||
|
||||
mock_rerank.assert_called_once_with("query", results, 2)
|
||||
assert len(reranked) <= 2
|
||||
mock_rerank.assert_called_once_with("query", results, 4)
|
||||
assert len(reranked) == 4
|
||||
# First result should be reranked winner
|
||||
assert reranked[0].path == "c.py"
|
||||
|
||||
@@ -633,6 +640,113 @@ class TestStagedCascadeIntegration:
|
||||
a_result = next(r for r in result.results if r.path == "a.py")
|
||||
assert a_result.score == 0.9
|
||||
|
||||
def test_staged_cascade_expands_stage3_target_for_rerank_budget(
|
||||
self, mock_registry, mock_mapper, temp_paths
|
||||
):
|
||||
"""Test staged cascade preserves enough Stage 3 reps for rerank budget."""
|
||||
config = Config(data_dir=temp_paths / "data")
|
||||
config.enable_staged_rerank = True
|
||||
config.reranker_top_k = 6
|
||||
config.reranking_top_k = 6
|
||||
|
||||
engine = ChainSearchEngine(mock_registry, mock_mapper, config=config)
|
||||
expanded_results = [
|
||||
SearchResult(path=f"src/file-{index}.ts", score=1.0 - (index * 0.01), excerpt="x")
|
||||
for index in range(8)
|
||||
]
|
||||
|
||||
with patch.object(engine, "_find_start_index") as mock_find:
|
||||
mock_find.return_value = temp_paths / "index" / "_index.db"
|
||||
|
||||
with patch.object(engine, "_collect_index_paths") as mock_collect:
|
||||
mock_collect.return_value = [temp_paths / "index" / "_index.db"]
|
||||
|
||||
with patch.object(engine, "_stage1_binary_search") as mock_stage1:
|
||||
mock_stage1.return_value = (
|
||||
[SearchResult(path="seed.ts", score=0.9, excerpt="seed")],
|
||||
temp_paths / "index",
|
||||
)
|
||||
|
||||
with patch.object(engine, "_stage2_lsp_expand") as mock_stage2:
|
||||
mock_stage2.return_value = expanded_results
|
||||
|
||||
with patch.object(engine, "_stage3_cluster_prune") as mock_stage3:
|
||||
mock_stage3.return_value = expanded_results[:6]
|
||||
|
||||
with patch.object(engine, "_stage4_optional_rerank") as mock_stage4:
|
||||
mock_stage4.return_value = expanded_results[:2]
|
||||
|
||||
engine.staged_cascade_search(
|
||||
"query",
|
||||
temp_paths / "src",
|
||||
k=2,
|
||||
coarse_k=20,
|
||||
)
|
||||
|
||||
mock_stage3.assert_called_once_with(
|
||||
expanded_results,
|
||||
6,
|
||||
query="query",
|
||||
)
|
||||
|
||||
def test_staged_cascade_overfetches_rerank_before_final_trim(
|
||||
self, mock_registry, mock_mapper, temp_paths
|
||||
):
|
||||
"""Test staged rerank keeps enough candidates for path penalties to work."""
|
||||
config = Config(data_dir=temp_paths / "data")
|
||||
config.enable_staged_rerank = True
|
||||
config.reranker_top_k = 4
|
||||
config.reranking_top_k = 4
|
||||
config.test_file_penalty = 0.15
|
||||
config.generated_file_penalty = 0.35
|
||||
|
||||
engine = ChainSearchEngine(mock_registry, mock_mapper, config=config)
|
||||
|
||||
src_primary = str(temp_paths / "src" / "tools" / "smart-search.ts")
|
||||
src_secondary = str(temp_paths / "src" / "tools" / "codex-lens.ts")
|
||||
test_primary = str(temp_paths / "tests" / "integration" / "cli-routes.test.ts")
|
||||
test_secondary = str(
|
||||
temp_paths / "frontend" / "tests" / "e2e" / "prompt-memory.spec.ts"
|
||||
)
|
||||
query = "parse CodexLens JSON output strip ANSI smart_search"
|
||||
clustered_results = [
|
||||
SearchResult(path=test_primary, score=0.98, excerpt="test"),
|
||||
SearchResult(path=test_secondary, score=0.97, excerpt="test"),
|
||||
SearchResult(path=src_primary, score=0.96, excerpt="source"),
|
||||
SearchResult(path=src_secondary, score=0.95, excerpt="source"),
|
||||
]
|
||||
|
||||
with patch.object(engine, "_find_start_index") as mock_find:
|
||||
mock_find.return_value = temp_paths / "index" / "_index.db"
|
||||
|
||||
with patch.object(engine, "_collect_index_paths") as mock_collect:
|
||||
mock_collect.return_value = [temp_paths / "index" / "_index.db"]
|
||||
|
||||
with patch.object(engine, "_stage1_binary_search") as mock_stage1:
|
||||
mock_stage1.return_value = (
|
||||
[SearchResult(path=src_primary, score=0.9, excerpt="seed")],
|
||||
temp_paths / "index",
|
||||
)
|
||||
|
||||
with patch.object(engine, "_stage2_lsp_expand") as mock_stage2:
|
||||
mock_stage2.return_value = clustered_results
|
||||
|
||||
with patch.object(engine, "_stage3_cluster_prune") as mock_stage3:
|
||||
mock_stage3.return_value = clustered_results
|
||||
|
||||
with patch.object(engine, "_cross_encoder_rerank") as mock_rerank:
|
||||
mock_rerank.return_value = clustered_results
|
||||
|
||||
result = engine.staged_cascade_search(
|
||||
query,
|
||||
temp_paths / "src",
|
||||
k=2,
|
||||
coarse_k=20,
|
||||
)
|
||||
|
||||
mock_rerank.assert_called_once_with(query, clustered_results, 4)
|
||||
assert [item.path for item in result.results] == [src_primary, src_secondary]
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Graceful Degradation Tests
|
||||
|
||||
Reference in New Issue
Block a user