feat: unified task.json schema migration and multi-module updates

- Create task-schema.json (JSON Schema draft-07) with 10 field blocks combining
  the strengths of the Unified JSONL, 6-field Task JSON, and Solution Schema formats
- Migrate unified-execute-with-file from JSONL to .task/*.json directory scanning
- Migrate 3 producers (lite-plan, plan-converter, collaborative-plan) to
  .task/*.json multi-file output
- Add review-cycle Phase 7.5 export-to-tasks (FIX-*.json) and issue-resolve
  --export-tasks option
- Add schema compatibility annotations to action-planning-agent, workflow-plan,
  and tdd-plan
- Add spec-generator skill phases and templates
- Add memory v2 pipeline (consolidation, extraction, job scheduler, embedder)
- Add secret-redactor utility and core-memory enhancements
- Add codex-lens accuracy benchmarks and staged env config overrides

Author: catlog22
Date: 2026-02-11 17:40:56 +08:00
Parent: 7aa1038951
Commit: 99ee4e7d36
36 changed files with 7823 additions and 315 deletions
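
For context on the .task/*.json migration described above, a minimal sketch of the directory scanning, assuming a flat .task/ directory with one JSON document per task (the helper name and file contents are illustrative, not the actual task-schema.json layout):

import json
from pathlib import Path


def load_tasks(task_dir: Path = Path(".task")) -> list[dict]:
    """Load every *.json task file from task_dir, sorted by filename for stable ordering."""
    tasks: list[dict] = []
    for path in sorted(task_dir.glob("*.json")):
        # One task object per file, e.g. a plan task or a review-cycle FIX-*.json export.
        with path.open(encoding="utf-8") as f:
            tasks.append(json.load(f))
    return tasks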

benchmarks/accuracy_queries_codexlens.jsonl

@@ -0,0 +1,33 @@
{"query":"class StandaloneLspManager","relevant_paths":["codexlens/lsp/standalone_manager.py"]}
{"query":"def _open_document","relevant_paths":["codexlens/lsp/standalone_manager.py"]}
{"query":"def _read_message","relevant_paths":["codexlens/lsp/standalone_manager.py"]}
{"query":"how does textDocument/didOpen work","relevant_paths":["codexlens/lsp/standalone_manager.py"]}
{"query":"class LspBridge","relevant_paths":["codexlens/lsp/lsp_bridge.py"]}
{"query":"def get_document_symbols","relevant_paths":["codexlens/lsp/lsp_bridge.py"]}
{"query":"class KeepAliveLspBridge","relevant_paths":["codexlens/lsp/keepalive_bridge.py"]}
{"query":"LSP keepalive bridge","relevant_paths":["codexlens/lsp/keepalive_bridge.py"]}
{"query":"class LspGraphBuilder","relevant_paths":["codexlens/lsp/lsp_graph_builder.py"]}
{"query":"def build_from_seeds","relevant_paths":["codexlens/lsp/lsp_graph_builder.py"]}
{"query":"def _stage2_realtime_lsp_expand","relevant_paths":["codexlens/search/chain_search.py"]}
{"query":"def _stage3_cluster_prune","relevant_paths":["codexlens/search/chain_search.py"]}
{"query":"def _cross_encoder_rerank","relevant_paths":["codexlens/search/chain_search.py"]}
{"query":"def dense_rerank_cascade_search","relevant_paths":["codexlens/search/chain_search.py"]}
{"query":"def cascade_search","relevant_paths":["codexlens/search/chain_search.py"]}
{"query":"def _find_nearest_binary_mmap_root","relevant_paths":["codexlens/search/chain_search.py"]}
{"query":"class BinarySearcher","relevant_paths":["codexlens/search/binary_searcher.py"]}
{"query":"class GraphExpander","relevant_paths":["codexlens/search/graph_expander.py"]}
{"query":"def cross_encoder_rerank","relevant_paths":["codexlens/search/ranking.py"]}
{"query":"def group_similar_results","relevant_paths":["codexlens/search/ranking.py"]}
{"query":"class ConfigError","relevant_paths":["codexlens/errors.py"]}
{"query":"def load_settings","relevant_paths":["codexlens/config.py"]}
{"query":"BINARY_VECTORS_MMAP_NAME","relevant_paths":["codexlens/config.py"]}
{"query":"STAGED_CLUSTERING_STRATEGY","relevant_paths":["codexlens/config.py","codexlens/env_config.py"]}
{"query":"def apply_workspace_env","relevant_paths":["codexlens/env_config.py"]}
{"query":"def generate_env_example","relevant_paths":["codexlens/env_config.py"]}
{"query":"def get_reranker","relevant_paths":["codexlens/semantic/reranker/factory.py"]}
{"query":"class APIReranker","relevant_paths":["codexlens/semantic/reranker/api_reranker.py"]}
{"query":"class RegistryStore","relevant_paths":["codexlens/storage/registry.py"]}
{"query":"class PathMapper","relevant_paths":["codexlens/storage/path_mapper.py"]}
{"query":"def lsp_status","relevant_paths":["codexlens/cli/commands.py"]}
{"query":"graph_neighbors migration","relevant_paths":["codexlens/storage/migrations/migration_007_add_graph_neighbors.py"]}
{"query":"def get_model_config","relevant_paths":["codexlens/semantic/vector_store.py"]}

benchmarks/compare_accuracy_labeled.py

@@ -0,0 +1,365 @@
#!/usr/bin/env python
"""Compare labeled accuracy: staged(realtime LSP graph) vs dense_rerank.
This script measures retrieval "accuracy" against a labeled query set.
Each query must provide a list of relevant file paths (relative to --source
or absolute). We report:
- Hit@K (any relevant file appears in top-K)
- MRR@K (reciprocal rank of first relevant file within top-K)
- Recall@K (fraction of relevant files present in top-K)
Example:
python benchmarks/compare_accuracy_labeled.py --source ./src
python benchmarks/compare_accuracy_labeled.py --queries-file benchmarks/accuracy_queries_codexlens.jsonl
"""
from __future__ import annotations
import argparse
import gc
import json
import os
import re
import statistics
import sys
import time
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
# Add src to path (match other benchmark scripts)
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from codexlens.config import Config
from codexlens.search.chain_search import ChainSearchEngine, SearchOptions
from codexlens.storage.path_mapper import PathMapper
from codexlens.storage.registry import RegistryStore
DEFAULT_QUERIES_FILE = Path(__file__).parent / "accuracy_queries_codexlens.jsonl"
def _now_ms() -> float:
return time.perf_counter() * 1000.0
def _normalize_path_key(path: str) -> str:
"""Normalize file paths for overlap/dedup metrics (Windows-safe)."""
try:
p = Path(path)
# Don't explode on non-files like "<memory>".
if str(p) and (p.is_absolute() or re.match(r"^[A-Za-z]:", str(p))):
norm = str(p.resolve())
else:
norm = str(p)
except Exception:
norm = path
norm = norm.replace("/", "\\")
if os.name == "nt":
norm = norm.lower()
return norm
def _load_labeled_queries(path: Path, limit: Optional[int]) -> List[Dict[str, Any]]:
if not path.is_file():
raise SystemExit(f"Queries file does not exist: {path}")
out: List[Dict[str, Any]] = []
for raw_line in path.read_text(encoding="utf-8", errors="ignore").splitlines():
line = raw_line.strip()
if not line or line.startswith("#"):
continue
try:
item = json.loads(line)
except Exception as exc:
raise SystemExit(f"Invalid JSONL line in {path}: {raw_line!r} ({exc})") from exc
if not isinstance(item, dict) or "query" not in item:
raise SystemExit(f"Invalid query item (expected object with 'query'): {item!r}")
out.append(item)
if limit is not None and len(out) >= limit:
break
return out
def _dedup_topk(paths: Iterable[str], k: int) -> List[str]:
out: List[str] = []
seen: set[str] = set()
for p in paths:
if p in seen:
continue
seen.add(p)
out.append(p)
if len(out) >= k:
break
return out
def _first_hit_rank(topk_paths: Sequence[str], relevant: set[str]) -> Optional[int]:
for i, p in enumerate(topk_paths, start=1):
if p in relevant:
return i
return None
@dataclass
class StrategyRun:
strategy: str
latency_ms: float
topk_paths: List[str]
first_hit_rank: Optional[int]
hit_at_k: bool
recall_at_k: float
error: Optional[str] = None
@dataclass
class QueryEval:
query: str
relevant_paths: List[str]
staged: StrategyRun
dense_rerank: StrategyRun
def _run_strategy(
engine: ChainSearchEngine,
*,
strategy: str,
query: str,
source_path: Path,
k: int,
coarse_k: int,
relevant: set[str],
options: Optional[SearchOptions] = None,
) -> StrategyRun:
gc.collect()
start_ms = _now_ms()
try:
result = engine.cascade_search(
query=query,
source_path=source_path,
k=k,
coarse_k=coarse_k,
options=options,
strategy=strategy,
)
latency_ms = _now_ms() - start_ms
paths_raw = [r.path for r in (result.results or []) if getattr(r, "path", None)]
paths_norm = [_normalize_path_key(p) for p in paths_raw]
topk = _dedup_topk(paths_norm, k=k)
rank = _first_hit_rank(topk, relevant)
hit = rank is not None
recall = 0.0
if relevant:
recall = len(set(topk) & relevant) / float(len(relevant))
return StrategyRun(
strategy=strategy,
latency_ms=latency_ms,
topk_paths=topk,
first_hit_rank=rank,
hit_at_k=hit,
recall_at_k=recall,
error=None,
)
except Exception as exc:
latency_ms = _now_ms() - start_ms
return StrategyRun(
strategy=strategy,
latency_ms=latency_ms,
topk_paths=[],
first_hit_rank=None,
hit_at_k=False,
recall_at_k=0.0,
error=repr(exc),
)
def _mrr(ranks: Sequence[Optional[int]]) -> float:
vals = []
for r in ranks:
if r is None or r <= 0:
vals.append(0.0)
else:
vals.append(1.0 / float(r))
return statistics.mean(vals) if vals else 0.0
def main() -> None:
parser = argparse.ArgumentParser(
description="Compare labeled retrieval accuracy: staged(realtime) vs dense_rerank"
)
parser.add_argument(
"--source",
type=Path,
default=Path(__file__).parent.parent / "src",
help="Source directory to search (default: ./src)",
)
parser.add_argument(
"--queries-file",
type=Path,
default=DEFAULT_QUERIES_FILE,
help="JSONL file with {query, relevant_paths[]} per line",
)
parser.add_argument("--queries", type=int, default=None, help="Limit number of queries")
parser.add_argument("--k", type=int, default=10, help="Top-K for evaluation (default 10)")
parser.add_argument("--coarse-k", type=int, default=100, help="Coarse candidates (default 100)")
parser.add_argument(
"--staged-cluster-strategy",
type=str,
default="path",
help="Config.staged_clustering_strategy override for staged (default: path)",
)
parser.add_argument(
"--stage2-mode",
type=str,
default="realtime",
help="Config.staged_stage2_mode override for staged (default: realtime)",
)
parser.add_argument(
"--output",
type=Path,
default=Path(__file__).parent / "results" / "accuracy_labeled.json",
help="Output JSON path",
)
args = parser.parse_args()
if not args.source.exists():
raise SystemExit(f"Source path does not exist: {args.source}")
labeled = _load_labeled_queries(args.queries_file, args.queries)
if not labeled:
raise SystemExit("No queries to run")
source_root = args.source.expanduser().resolve()
# Match CLI behavior: load settings + apply global/workspace .env overrides.
config = Config.load()
config.cascade_strategy = "staged"
config.staged_stage2_mode = str(args.stage2_mode or "realtime").strip().lower()
config.enable_staged_rerank = True
config.staged_clustering_strategy = str(args.staged_cluster_strategy or "path").strip().lower()
# Stability: on some Windows setups, DirectML/ONNX can crash under load.
config.embedding_use_gpu = False
registry = RegistryStore()
registry.initialize()
mapper = PathMapper()
engine = ChainSearchEngine(registry=registry, mapper=mapper, config=config)
def resolve_expected(paths: Sequence[str]) -> set[str]:
out: set[str] = set()
for p in paths:
try:
cand = Path(p)
if not cand.is_absolute():
cand = (source_root / cand).resolve()
out.add(_normalize_path_key(str(cand)))
except Exception:
out.add(_normalize_path_key(p))
return out
evaluations: List[QueryEval] = []
try:
for i, item in enumerate(labeled, start=1):
query = str(item.get("query", "")).strip()
relevant_raw = item.get("relevant_paths") or []
if not query:
continue
if not isinstance(relevant_raw, list) or not relevant_raw:
raise SystemExit(f"Query item missing relevant_paths[]: {item!r}")
relevant = resolve_expected([str(p) for p in relevant_raw])
print(f"[{i}/{len(labeled)}] {query}")
staged = _run_strategy(
engine,
strategy="staged",
query=query,
source_path=source_root,
k=int(args.k),
coarse_k=int(args.coarse_k),
relevant=relevant,
options=None,
)
dense = _run_strategy(
engine,
strategy="dense_rerank",
query=query,
source_path=source_root,
k=int(args.k),
coarse_k=int(args.coarse_k),
relevant=relevant,
options=None,
)
evaluations.append(
QueryEval(
query=query,
relevant_paths=[_normalize_path_key(str((source_root / p).resolve())) if not Path(p).is_absolute() else _normalize_path_key(p) for p in relevant_raw],
staged=staged,
dense_rerank=dense,
)
)
finally:
try:
engine.close()
except Exception:
pass
try:
registry.close()
except Exception:
pass
staged_runs = [e.staged for e in evaluations]
dense_runs = [e.dense_rerank for e in evaluations]
def mean(xs: Sequence[float]) -> float:
return statistics.mean(xs) if xs else 0.0
staged_ranks = [r.first_hit_rank for r in staged_runs]
dense_ranks = [r.first_hit_rank for r in dense_runs]
summary = {
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
"source": str(source_root),
"queries_file": str(args.queries_file),
"query_count": len(evaluations),
"k": int(args.k),
"coarse_k": int(args.coarse_k),
"staged": {
"hit_at_k": mean([1.0 if r.hit_at_k else 0.0 for r in staged_runs]),
"mrr_at_k": _mrr(staged_ranks),
"avg_recall_at_k": mean([r.recall_at_k for r in staged_runs]),
"avg_latency_ms": mean([r.latency_ms for r in staged_runs if not r.error]),
"errors": sum(1 for r in staged_runs if r.error),
},
"dense_rerank": {
"hit_at_k": mean([1.0 if r.hit_at_k else 0.0 for r in dense_runs]),
"mrr_at_k": _mrr(dense_ranks),
"avg_recall_at_k": mean([r.recall_at_k for r in dense_runs]),
"avg_latency_ms": mean([r.latency_ms for r in dense_runs if not r.error]),
"errors": sum(1 for r in dense_runs if r.error),
},
"config": {
"staged_stage2_mode": config.staged_stage2_mode,
"staged_clustering_strategy": config.staged_clustering_strategy,
"enable_staged_rerank": bool(config.enable_staged_rerank),
"reranker_backend": config.reranker_backend,
"reranker_model": config.reranker_model,
"embedding_backend": config.embedding_backend,
"embedding_model": config.embedding_model,
},
}
payload = {"summary": summary, "evaluations": [asdict(e) for e in evaluations]}
args.output.parent.mkdir(parents=True, exist_ok=True)
args.output.write_text(json.dumps(payload, indent=2), encoding="utf-8")
print("\n=== SUMMARY ===")
print(json.dumps(summary, indent=2))
print(f"\nSaved: {args.output}")
if __name__ == "__main__":
main()
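
For reference, a standalone sketch (not part of the committed script) of how the three reported metrics behave on one toy query, matching the Hit@K / MRR@K / Recall@K definitions in the docstring above:

def hit_mrr_recall(topk: list[str], relevant: set[str]) -> tuple[float, float, float]:
    """Hit@K, reciprocal rank of the first hit, and Recall@K for a single query."""
    first_hit = next((i for i, p in enumerate(topk, start=1) if p in relevant), None)
    hit = 1.0 if first_hit else 0.0
    mrr = 1.0 / first_hit if first_hit else 0.0
    recall = len(set(topk) & relevant) / len(relevant) if relevant else 0.0
    return hit, mrr, recall


# One relevant file at rank 3 of the top-5 list: Hit@5=1.0, MRR@5=1/3, Recall@5=0.5 (1 of 2 found).
print(hit_mrr_recall(["a.py", "b.py", "c.py", "d.py", "e.py"], {"c.py", "z.py"}))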

File diff suppressed because it is too large

File diff suppressed because it is too large

codexlens/config.py

@@ -315,119 +315,117 @@ class Config:
def load_settings(self) -> None:
"""Load settings from file if exists."""
if not self.settings_path.exists():
return
if self.settings_path.exists():
try:
with open(self.settings_path, "r", encoding="utf-8") as f:
settings = json.load(f)
try:
with open(self.settings_path, "r", encoding="utf-8") as f:
settings = json.load(f)
# Load embedding settings
embedding = settings.get("embedding", {})
if "backend" in embedding:
backend = embedding["backend"]
# Support 'api' as alias for 'litellm'
if backend == "api":
backend = "litellm"
if backend in {"fastembed", "litellm"}:
self.embedding_backend = backend
else:
log.warning(
"Invalid embedding backend in %s: %r (expected 'fastembed' or 'litellm')",
self.settings_path,
embedding["backend"],
)
if "model" in embedding:
self.embedding_model = embedding["model"]
if "use_gpu" in embedding:
self.embedding_use_gpu = embedding["use_gpu"]
# Load embedding settings
embedding = settings.get("embedding", {})
if "backend" in embedding:
backend = embedding["backend"]
# Support 'api' as alias for 'litellm'
if backend == "api":
backend = "litellm"
if backend in {"fastembed", "litellm"}:
self.embedding_backend = backend
else:
log.warning(
"Invalid embedding backend in %s: %r (expected 'fastembed' or 'litellm')",
self.settings_path,
embedding["backend"],
)
if "model" in embedding:
self.embedding_model = embedding["model"]
if "use_gpu" in embedding:
self.embedding_use_gpu = embedding["use_gpu"]
# Load multi-endpoint configuration
if "endpoints" in embedding:
self.embedding_endpoints = embedding["endpoints"]
if "pool_enabled" in embedding:
self.embedding_pool_enabled = embedding["pool_enabled"]
if "strategy" in embedding:
self.embedding_strategy = embedding["strategy"]
if "cooldown" in embedding:
self.embedding_cooldown = embedding["cooldown"]
# Load multi-endpoint configuration
if "endpoints" in embedding:
self.embedding_endpoints = embedding["endpoints"]
if "pool_enabled" in embedding:
self.embedding_pool_enabled = embedding["pool_enabled"]
if "strategy" in embedding:
self.embedding_strategy = embedding["strategy"]
if "cooldown" in embedding:
self.embedding_cooldown = embedding["cooldown"]
# Load LLM settings
llm = settings.get("llm", {})
if "enabled" in llm:
self.llm_enabled = llm["enabled"]
if "tool" in llm:
self.llm_tool = llm["tool"]
if "timeout_ms" in llm:
self.llm_timeout_ms = llm["timeout_ms"]
if "batch_size" in llm:
self.llm_batch_size = llm["batch_size"]
# Load LLM settings
llm = settings.get("llm", {})
if "enabled" in llm:
self.llm_enabled = llm["enabled"]
if "tool" in llm:
self.llm_tool = llm["tool"]
if "timeout_ms" in llm:
self.llm_timeout_ms = llm["timeout_ms"]
if "batch_size" in llm:
self.llm_batch_size = llm["batch_size"]
# Load reranker settings
reranker = settings.get("reranker", {})
if "enabled" in reranker:
self.enable_cross_encoder_rerank = reranker["enabled"]
if "backend" in reranker:
backend = reranker["backend"]
if backend in {"fastembed", "onnx", "api", "litellm", "legacy"}:
self.reranker_backend = backend
else:
log.warning(
"Invalid reranker backend in %s: %r (expected 'fastembed', 'onnx', 'api', 'litellm', or 'legacy')",
self.settings_path,
backend,
)
if "model" in reranker:
self.reranker_model = reranker["model"]
if "top_k" in reranker:
self.reranker_top_k = reranker["top_k"]
if "max_input_tokens" in reranker:
self.reranker_max_input_tokens = reranker["max_input_tokens"]
if "pool_enabled" in reranker:
self.reranker_pool_enabled = reranker["pool_enabled"]
if "strategy" in reranker:
self.reranker_strategy = reranker["strategy"]
if "cooldown" in reranker:
self.reranker_cooldown = reranker["cooldown"]
# Load reranker settings
reranker = settings.get("reranker", {})
if "enabled" in reranker:
self.enable_cross_encoder_rerank = reranker["enabled"]
if "backend" in reranker:
backend = reranker["backend"]
if backend in {"fastembed", "onnx", "api", "litellm", "legacy"}:
self.reranker_backend = backend
else:
log.warning(
"Invalid reranker backend in %s: %r (expected 'fastembed', 'onnx', 'api', 'litellm', or 'legacy')",
self.settings_path,
backend,
)
if "model" in reranker:
self.reranker_model = reranker["model"]
if "top_k" in reranker:
self.reranker_top_k = reranker["top_k"]
if "max_input_tokens" in reranker:
self.reranker_max_input_tokens = reranker["max_input_tokens"]
if "pool_enabled" in reranker:
self.reranker_pool_enabled = reranker["pool_enabled"]
if "strategy" in reranker:
self.reranker_strategy = reranker["strategy"]
if "cooldown" in reranker:
self.reranker_cooldown = reranker["cooldown"]
# Load cascade settings
cascade = settings.get("cascade", {})
if "strategy" in cascade:
strategy = cascade["strategy"]
if strategy in {"binary", "binary_rerank", "dense_rerank", "staged"}:
self.cascade_strategy = strategy
else:
log.warning(
"Invalid cascade strategy in %s: %r (expected 'binary', 'binary_rerank', 'dense_rerank', or 'staged')",
self.settings_path,
strategy,
)
if "coarse_k" in cascade:
self.cascade_coarse_k = cascade["coarse_k"]
if "fine_k" in cascade:
self.cascade_fine_k = cascade["fine_k"]
# Load cascade settings
cascade = settings.get("cascade", {})
if "strategy" in cascade:
strategy = cascade["strategy"]
if strategy in {"binary", "binary_rerank", "dense_rerank", "staged"}:
self.cascade_strategy = strategy
else:
log.warning(
"Invalid cascade strategy in %s: %r (expected 'binary', 'binary_rerank', 'dense_rerank', or 'staged')",
self.settings_path,
strategy,
)
if "coarse_k" in cascade:
self.cascade_coarse_k = cascade["coarse_k"]
if "fine_k" in cascade:
self.cascade_fine_k = cascade["fine_k"]
# Load API settings
api = settings.get("api", {})
if "max_workers" in api:
self.api_max_workers = api["max_workers"]
if "batch_size" in api:
self.api_batch_size = api["batch_size"]
if "batch_size_dynamic" in api:
self.api_batch_size_dynamic = api["batch_size_dynamic"]
if "batch_size_utilization_factor" in api:
self.api_batch_size_utilization_factor = api["batch_size_utilization_factor"]
if "batch_size_max" in api:
self.api_batch_size_max = api["batch_size_max"]
if "chars_per_token_estimate" in api:
self.chars_per_token_estimate = api["chars_per_token_estimate"]
except Exception as exc:
log.warning(
"Failed to load settings from %s (%s): %s",
self.settings_path,
type(exc).__name__,
exc,
)
# Load API settings
api = settings.get("api", {})
if "max_workers" in api:
self.api_max_workers = api["max_workers"]
if "batch_size" in api:
self.api_batch_size = api["batch_size"]
if "batch_size_dynamic" in api:
self.api_batch_size_dynamic = api["batch_size_dynamic"]
if "batch_size_utilization_factor" in api:
self.api_batch_size_utilization_factor = api["batch_size_utilization_factor"]
if "batch_size_max" in api:
self.api_batch_size_max = api["batch_size_max"]
if "chars_per_token_estimate" in api:
self.chars_per_token_estimate = api["chars_per_token_estimate"]
except Exception as exc:
log.warning(
"Failed to load settings from %s (%s): %s",
self.settings_path,
type(exc).__name__,
exc,
)
# Apply .env overrides (highest priority)
self._apply_env_overrides()
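The settings file parsed above is plain JSON grouped into embedding, llm, reranker, cascade, and api blocks. A minimal sketch that writes such a file from Python; the key names are the ones consumed above, while the values and output path are illustrative only:

import json
from pathlib import Path

settings = {
    "embedding": {"backend": "fastembed", "model": "example/embedding-model", "use_gpu": False},
    "reranker": {"enabled": True, "backend": "litellm", "top_k": 20},
    "cascade": {"strategy": "staged", "coarse_k": 100, "fine_k": 10},
    "api": {"max_workers": 4, "batch_size": 32},
}
# The real settings_path is resolved by Config; "settings.json" here is just a placeholder.
Path("settings.json").write_text(json.dumps(settings, indent=2), encoding="utf-8")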
@@ -450,9 +448,9 @@ class Config:
RERANKER_STRATEGY: Load balance strategy for reranker
RERANKER_COOLDOWN: Rate limit cooldown for reranker
"""
from .env_config import load_global_env
from .env_config import load_env_file
env_vars = load_global_env()
env_vars = load_env_file(self.data_dir / ".env")
if not env_vars:
return
@@ -461,6 +459,43 @@ class Config:
# Check prefixed version first (Dashboard format), then unprefixed
return env_vars.get(f"CODEXLENS_{key}") or env_vars.get(key)
def _parse_bool(value: str) -> bool:
return value.strip().lower() in {"true", "1", "yes", "on"}
# Cascade overrides
cascade_enabled = get_env("ENABLE_CASCADE_SEARCH")
if cascade_enabled:
self.enable_cascade_search = _parse_bool(cascade_enabled)
log.debug(
"Overriding enable_cascade_search from .env: %s",
self.enable_cascade_search,
)
cascade_strategy = get_env("CASCADE_STRATEGY")
if cascade_strategy:
strategy = cascade_strategy.strip().lower()
if strategy in {"binary", "binary_rerank", "dense_rerank", "staged"}:
self.cascade_strategy = strategy
log.debug("Overriding cascade_strategy from .env: %s", self.cascade_strategy)
else:
log.warning("Invalid CASCADE_STRATEGY in .env: %r", cascade_strategy)
cascade_coarse_k = get_env("CASCADE_COARSE_K")
if cascade_coarse_k:
try:
self.cascade_coarse_k = int(cascade_coarse_k)
log.debug("Overriding cascade_coarse_k from .env: %s", self.cascade_coarse_k)
except ValueError:
log.warning("Invalid CASCADE_COARSE_K in .env: %r", cascade_coarse_k)
cascade_fine_k = get_env("CASCADE_FINE_K")
if cascade_fine_k:
try:
self.cascade_fine_k = int(cascade_fine_k)
log.debug("Overriding cascade_fine_k from .env: %s", self.cascade_fine_k)
except ValueError:
log.warning("Invalid CASCADE_FINE_K in .env: %r", cascade_fine_k)
# Embedding overrides
embedding_model = get_env("EMBEDDING_MODEL")
if embedding_model:
@@ -583,6 +618,136 @@ class Config:
self.chunk_strip_docstrings = strip_docstrings.lower() in ("true", "1", "yes")
log.debug("Overriding chunk_strip_docstrings from .env: %s", self.chunk_strip_docstrings)
# Staged cascade overrides
staged_stage2_mode = get_env("STAGED_STAGE2_MODE")
if staged_stage2_mode:
mode = staged_stage2_mode.strip().lower()
if mode in {"precomputed", "realtime"}:
self.staged_stage2_mode = mode
log.debug("Overriding staged_stage2_mode from .env: %s", self.staged_stage2_mode)
elif mode in {"live"}:
self.staged_stage2_mode = "realtime"
log.debug("Overriding staged_stage2_mode from .env: %s", self.staged_stage2_mode)
else:
log.warning("Invalid STAGED_STAGE2_MODE in .env: %r", staged_stage2_mode)
staged_clustering_strategy = get_env("STAGED_CLUSTERING_STRATEGY")
if staged_clustering_strategy:
strategy = staged_clustering_strategy.strip().lower()
if strategy in {"auto", "hdbscan", "dbscan", "frequency", "noop", "score", "dir_rr", "path"}:
self.staged_clustering_strategy = strategy
log.debug(
"Overriding staged_clustering_strategy from .env: %s",
self.staged_clustering_strategy,
)
elif strategy in {"none", "off"}:
self.staged_clustering_strategy = "noop"
log.debug(
"Overriding staged_clustering_strategy from .env: %s",
self.staged_clustering_strategy,
)
else:
log.warning(
"Invalid STAGED_CLUSTERING_STRATEGY in .env: %r",
staged_clustering_strategy,
)
staged_clustering_min_size = get_env("STAGED_CLUSTERING_MIN_SIZE")
if staged_clustering_min_size:
try:
self.staged_clustering_min_size = int(staged_clustering_min_size)
log.debug(
"Overriding staged_clustering_min_size from .env: %s",
self.staged_clustering_min_size,
)
except ValueError:
log.warning(
"Invalid STAGED_CLUSTERING_MIN_SIZE in .env: %r",
staged_clustering_min_size,
)
enable_staged_rerank = get_env("ENABLE_STAGED_RERANK")
if enable_staged_rerank:
self.enable_staged_rerank = _parse_bool(enable_staged_rerank)
log.debug("Overriding enable_staged_rerank from .env: %s", self.enable_staged_rerank)
rt_timeout = get_env("STAGED_REALTIME_LSP_TIMEOUT_S")
if rt_timeout:
try:
self.staged_realtime_lsp_timeout_s = float(rt_timeout)
log.debug(
"Overriding staged_realtime_lsp_timeout_s from .env: %s",
self.staged_realtime_lsp_timeout_s,
)
except ValueError:
log.warning("Invalid STAGED_REALTIME_LSP_TIMEOUT_S in .env: %r", rt_timeout)
rt_depth = get_env("STAGED_REALTIME_LSP_DEPTH")
if rt_depth:
try:
self.staged_realtime_lsp_depth = int(rt_depth)
log.debug(
"Overriding staged_realtime_lsp_depth from .env: %s",
self.staged_realtime_lsp_depth,
)
except ValueError:
log.warning("Invalid STAGED_REALTIME_LSP_DEPTH in .env: %r", rt_depth)
rt_max_nodes = get_env("STAGED_REALTIME_LSP_MAX_NODES")
if rt_max_nodes:
try:
self.staged_realtime_lsp_max_nodes = int(rt_max_nodes)
log.debug(
"Overriding staged_realtime_lsp_max_nodes from .env: %s",
self.staged_realtime_lsp_max_nodes,
)
except ValueError:
log.warning("Invalid STAGED_REALTIME_LSP_MAX_NODES in .env: %r", rt_max_nodes)
rt_max_seeds = get_env("STAGED_REALTIME_LSP_MAX_SEEDS")
if rt_max_seeds:
try:
self.staged_realtime_lsp_max_seeds = int(rt_max_seeds)
log.debug(
"Overriding staged_realtime_lsp_max_seeds from .env: %s",
self.staged_realtime_lsp_max_seeds,
)
except ValueError:
log.warning("Invalid STAGED_REALTIME_LSP_MAX_SEEDS in .env: %r", rt_max_seeds)
rt_max_concurrent = get_env("STAGED_REALTIME_LSP_MAX_CONCURRENT")
if rt_max_concurrent:
try:
self.staged_realtime_lsp_max_concurrent = int(rt_max_concurrent)
log.debug(
"Overriding staged_realtime_lsp_max_concurrent from .env: %s",
self.staged_realtime_lsp_max_concurrent,
)
except ValueError:
log.warning(
"Invalid STAGED_REALTIME_LSP_MAX_CONCURRENT in .env: %r",
rt_max_concurrent,
)
rt_warmup = get_env("STAGED_REALTIME_LSP_WARMUP_S")
if rt_warmup:
try:
self.staged_realtime_lsp_warmup_s = float(rt_warmup)
log.debug(
"Overriding staged_realtime_lsp_warmup_s from .env: %s",
self.staged_realtime_lsp_warmup_s,
)
except ValueError:
log.warning("Invalid STAGED_REALTIME_LSP_WARMUP_S in .env: %r", rt_warmup)
rt_resolve = get_env("STAGED_REALTIME_LSP_RESOLVE_SYMBOLS")
if rt_resolve:
self.staged_realtime_lsp_resolve_symbols = _parse_bool(rt_resolve)
log.debug(
"Overriding staged_realtime_lsp_resolve_symbols from .env: %s",
self.staged_realtime_lsp_resolve_symbols,
)
@classmethod
def load(cls) -> "Config":
"""Load config with settings from file."""

codexlens/env_config.py

@@ -45,6 +45,22 @@ ENV_VARS = {
# General configuration
"CODEXLENS_DATA_DIR": "Custom data directory path",
"CODEXLENS_DEBUG": "Enable debug mode (true/false)",
# Cascade / staged pipeline configuration
"ENABLE_CASCADE_SEARCH": "Enable cascade search (true/false)",
"CASCADE_STRATEGY": "Cascade strategy: binary, binary_rerank, dense_rerank, staged",
"CASCADE_COARSE_K": "Cascade coarse_k candidate count (int)",
"CASCADE_FINE_K": "Cascade fine_k result count (int)",
"STAGED_STAGE2_MODE": "Staged Stage 2 mode: precomputed, realtime",
"STAGED_CLUSTERING_STRATEGY": "Staged clustering strategy: auto, score, path, dir_rr, noop, ...",
"STAGED_CLUSTERING_MIN_SIZE": "Staged clustering min cluster size (int)",
"ENABLE_STAGED_RERANK": "Enable staged reranking in Stage 4 (true/false)",
"STAGED_REALTIME_LSP_TIMEOUT_S": "Realtime LSP expansion timeout budget (float seconds)",
"STAGED_REALTIME_LSP_DEPTH": "Realtime LSP BFS depth (int)",
"STAGED_REALTIME_LSP_MAX_NODES": "Realtime LSP max nodes (int)",
"STAGED_REALTIME_LSP_MAX_SEEDS": "Realtime LSP max seeds (int)",
"STAGED_REALTIME_LSP_MAX_CONCURRENT": "Realtime LSP max concurrent requests (int)",
"STAGED_REALTIME_LSP_WARMUP_S": "Realtime LSP warmup wait after didOpen (float seconds)",
"STAGED_REALTIME_LSP_RESOLVE_SYMBOLS": "Resolve symbols via documentSymbol in realtime expansion (true/false)",
# Chunking configuration
"CHUNK_STRIP_COMMENTS": "Strip comments from code chunks for embedding: true/false (default: true)",
"CHUNK_STRIP_DOCSTRINGS": "Strip docstrings from code chunks for embedding: true/false (default: true)",


@@ -0,0 +1,117 @@
"""Unit tests for Config .env overrides for staged/cascade settings."""
from __future__ import annotations
import tempfile
from pathlib import Path
import pytest
from codexlens.config import Config
@pytest.fixture
def temp_config_dir() -> Path:
"""Create temporary directory for config data_dir."""
tmpdir = tempfile.TemporaryDirectory(ignore_cleanup_errors=True)
yield Path(tmpdir.name)
try:
tmpdir.cleanup()
except (PermissionError, OSError):
pass
def test_staged_env_overrides_apply(temp_config_dir: Path) -> None:
config = Config(data_dir=temp_config_dir)
env_path = temp_config_dir / ".env"
env_path.write_text(
"\n".join(
[
"ENABLE_CASCADE_SEARCH=true",
"CASCADE_STRATEGY=staged",
"CASCADE_COARSE_K=111",
"CASCADE_FINE_K=7",
"STAGED_STAGE2_MODE=realtime",
"STAGED_CLUSTERING_STRATEGY=path",
"STAGED_CLUSTERING_MIN_SIZE=5",
"ENABLE_STAGED_RERANK=false",
"STAGED_REALTIME_LSP_TIMEOUT_S=12.5",
"STAGED_REALTIME_LSP_DEPTH=2",
"STAGED_REALTIME_LSP_MAX_NODES=123",
"STAGED_REALTIME_LSP_MAX_SEEDS=3",
"STAGED_REALTIME_LSP_MAX_CONCURRENT=4",
"STAGED_REALTIME_LSP_WARMUP_S=0.25",
"STAGED_REALTIME_LSP_RESOLVE_SYMBOLS=yes",
"",
]
),
encoding="utf-8",
)
config.load_settings()
assert config.enable_cascade_search is True
assert config.cascade_strategy == "staged"
assert config.cascade_coarse_k == 111
assert config.cascade_fine_k == 7
assert config.staged_stage2_mode == "realtime"
assert config.staged_clustering_strategy == "path"
assert config.staged_clustering_min_size == 5
assert config.enable_staged_rerank is False
assert config.staged_realtime_lsp_timeout_s == 12.5
assert config.staged_realtime_lsp_depth == 2
assert config.staged_realtime_lsp_max_nodes == 123
assert config.staged_realtime_lsp_max_seeds == 3
assert config.staged_realtime_lsp_max_concurrent == 4
assert config.staged_realtime_lsp_warmup_s == 0.25
assert config.staged_realtime_lsp_resolve_symbols is True
def test_staged_env_overrides_prefixed_wins(temp_config_dir: Path) -> None:
config = Config(data_dir=temp_config_dir)
env_path = temp_config_dir / ".env"
env_path.write_text(
"\n".join(
[
"STAGED_CLUSTERING_STRATEGY=score",
"CODEXLENS_STAGED_CLUSTERING_STRATEGY=path",
"STAGED_STAGE2_MODE=precomputed",
"CODEXLENS_STAGED_STAGE2_MODE=realtime",
"",
]
),
encoding="utf-8",
)
config.load_settings()
assert config.staged_clustering_strategy == "path"
assert config.staged_stage2_mode == "realtime"
def test_staged_env_overrides_invalid_ignored(temp_config_dir: Path) -> None:
config = Config(data_dir=temp_config_dir)
env_path = temp_config_dir / ".env"
env_path.write_text(
"\n".join(
[
"STAGED_STAGE2_MODE=bogus",
"STAGED_CLUSTERING_STRATEGY=embedding_remote",
"STAGED_REALTIME_LSP_TIMEOUT_S=nope",
"CASCADE_STRATEGY=???",
"",
]
),
encoding="utf-8",
)
config.load_settings()
assert config.cascade_strategy == "binary"
assert config.staged_stage2_mode == "precomputed"
assert config.staged_clustering_strategy == "auto"
assert config.staged_realtime_lsp_timeout_s == 30.0