Claude-Code-Workflow/codex-lens/benchmarks/compare_accuracy_labeled.py
catlog22 99ee4e7d36 feat: unified task.json schema migration and multi-module updates
- Create task-schema.json (JSON Schema draft-07) with 10 field blocks that
  fuse the strengths of the Unified JSONL, 6-field Task JSON, and Solution
  Schema formats
- Migrate unified-execute-with-file from JSONL to .task/*.json directory scanning
- Migrate 3 producers (lite-plan, plan-converter, collaborative-plan) to
  .task/*.json multi-file output
- Add review-cycle Phase 7.5 export-to-tasks (FIX-*.json) and issue-resolve
  --export-tasks option
- Add schema compatibility annotations to action-planning-agent, workflow-plan,
  and tdd-plan
- Add spec-generator skill phases and templates
- Add memory v2 pipeline (consolidation, extraction, job scheduler, embedder)
- Add secret-redactor utility and core-memory enhancements
- Add codex-lens accuracy benchmarks and staged env config overrides
2026-02-11 17:40:56 +08:00


#!/usr/bin/env python
"""Compare labeled accuracy: staged(realtime LSP graph) vs dense_rerank.
This script measures retrieval "accuracy" against a labeled query set.
Each query must provide a list of relevant file paths (relative to --source
or absolute). We report:
- Hit@K (any relevant file appears in top-K)
- MRR@K (reciprocal rank of first relevant file within top-K)
- Recall@K (fraction of relevant files present in top-K)
Example:
python benchmarks/compare_accuracy_labeled.py --source ./src
python benchmarks/compare_accuracy_labeled.py --queries-file benchmarks/accuracy_queries_codexlens.jsonl
"""

from __future__ import annotations

import argparse
import gc
import json
import os
import re
import statistics
import sys
import time
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple

# Add src to path (match other benchmark scripts)
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from codexlens.config import Config
from codexlens.search.chain_search import ChainSearchEngine, SearchOptions
from codexlens.storage.path_mapper import PathMapper
from codexlens.storage.registry import RegistryStore

DEFAULT_QUERIES_FILE = Path(__file__).parent / "accuracy_queries_codexlens.jsonl"


def _now_ms() -> float:
    return time.perf_counter() * 1000.0


def _normalize_path_key(path: str) -> str:
    """Normalize file paths for overlap/dedup metrics (Windows-safe)."""
    try:
        p = Path(path)
        # Don't explode on non-files like "<memory>".
        if str(p) and (p.is_absolute() or re.match(r"^[A-Za-z]:", str(p))):
            norm = str(p.resolve())
        else:
            norm = str(p)
    except Exception:
        norm = path
    norm = norm.replace("/", "\\")
    if os.name == "nt":
        norm = norm.lower()
    return norm
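

# For illustration: _normalize_path_key("src/App.py") -> "src\\App.py" (and
# "src\\app.py" on Windows, where keys are lowercased); absolute or
# drive-prefixed paths are resolved first. Backslash separators keep keys
# comparable between labeled paths and engine results.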


def _load_labeled_queries(path: Path, limit: Optional[int]) -> List[Dict[str, Any]]:
    if not path.is_file():
        raise SystemExit(f"Queries file does not exist: {path}")
    out: List[Dict[str, Any]] = []
    for raw_line in path.read_text(encoding="utf-8", errors="ignore").splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue
        try:
            item = json.loads(line)
        except Exception as exc:
            raise SystemExit(f"Invalid JSONL line in {path}: {raw_line!r} ({exc})") from exc
        if not isinstance(item, dict) or "query" not in item:
            raise SystemExit(f"Invalid query item (expected object with 'query'): {item!r}")
        out.append(item)
        if limit is not None and len(out) >= limit:
            break
    return out
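

# A JSONL query line looks like the following (paths here are hypothetical):
#   {"query": "where is the reranker configured", "relevant_paths": ["search/chain_search.py"]}
# Blank lines and lines starting with "#" are skipped by the loader above.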


def _dedup_topk(paths: Iterable[str], k: int) -> List[str]:
    out: List[str] = []
    seen: set[str] = set()
    for p in paths:
        if p in seen:
            continue
        seen.add(p)
        out.append(p)
        if len(out) >= k:
            break
    return out
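

# Returns the 1-based rank of the first relevant path within the deduplicated
# top-K list, or None when no relevant file was retrieved at all.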
def _first_hit_rank(topk_paths: Sequence[str], relevant: set[str]) -> Optional[int]:
    for i, p in enumerate(topk_paths, start=1):
        if p in relevant:
            return i
    return None
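

# Per-query result records; dataclasses.asdict() serializes them directly into
# the JSON payload written by main().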
@dataclass
class StrategyRun:
    strategy: str
    latency_ms: float
    topk_paths: List[str]
    first_hit_rank: Optional[int]
    hit_at_k: bool
    recall_at_k: float
    error: Optional[str] = None


@dataclass
class QueryEval:
    query: str
    relevant_paths: List[str]
    staged: StrategyRun
    dense_rerank: StrategyRun
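

# Runs one cascade_search call for a (strategy, query) pair and scores its
# top-K against the labeled set. Exceptions are captured in StrategyRun.error
# so a single failing query cannot abort the whole benchmark run.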
def _run_strategy(
    engine: ChainSearchEngine,
    *,
    strategy: str,
    query: str,
    source_path: Path,
    k: int,
    coarse_k: int,
    relevant: set[str],
    options: Optional[SearchOptions] = None,
) -> StrategyRun:
    gc.collect()
    start_ms = _now_ms()
    try:
        result = engine.cascade_search(
            query=query,
            source_path=source_path,
            k=k,
            coarse_k=coarse_k,
            options=options,
            strategy=strategy,
        )
        latency_ms = _now_ms() - start_ms
        paths_raw = [r.path for r in (result.results or []) if getattr(r, "path", None)]
        paths_norm = [_normalize_path_key(p) for p in paths_raw]
        topk = _dedup_topk(paths_norm, k=k)
        rank = _first_hit_rank(topk, relevant)
        hit = rank is not None
        recall = 0.0
        if relevant:
            recall = len(set(topk) & relevant) / float(len(relevant))
        return StrategyRun(
            strategy=strategy,
            latency_ms=latency_ms,
            topk_paths=topk,
            first_hit_rank=rank,
            hit_at_k=hit,
            recall_at_k=recall,
            error=None,
        )
    except Exception as exc:
        latency_ms = _now_ms() - start_ms
        return StrategyRun(
            strategy=strategy,
            latency_ms=latency_ms,
            topk_paths=[],
            first_hit_rank=None,
            hit_at_k=False,
            recall_at_k=0.0,
            error=repr(exc),
        )
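

# MRR@K = mean(1 / first_hit_rank) over all queries, with misses (rank None)
# contributing 0.0, so errored or empty runs drag the average down.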
def _mrr(ranks: Sequence[Optional[int]]) -> float:
    vals = []
    for r in ranks:
        if r is None or r <= 0:
            vals.append(0.0)
        else:
            vals.append(1.0 / float(r))
    return statistics.mean(vals) if vals else 0.0


def main() -> None:
    parser = argparse.ArgumentParser(
        description="Compare labeled retrieval accuracy: staged (realtime) vs dense_rerank"
    )
    parser.add_argument(
        "--source",
        type=Path,
        default=Path(__file__).parent.parent / "src",
        help="Source directory to search (default: ./src)",
    )
    parser.add_argument(
        "--queries-file",
        type=Path,
        default=DEFAULT_QUERIES_FILE,
        help="JSONL file with {query, relevant_paths[]} per line",
    )
    parser.add_argument("--queries", type=int, default=None, help="Limit number of queries")
    parser.add_argument("--k", type=int, default=10, help="Top-K for evaluation (default 10)")
    parser.add_argument("--coarse-k", type=int, default=100, help="Coarse candidates (default 100)")
    parser.add_argument(
        "--staged-cluster-strategy",
        type=str,
        default="path",
        help="Config.staged_clustering_strategy override for staged (default: path)",
    )
    parser.add_argument(
        "--stage2-mode",
        type=str,
        default="realtime",
        help="Config.staged_stage2_mode override for staged (default: realtime)",
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=Path(__file__).parent / "results" / "accuracy_labeled.json",
        help="Output JSON path",
    )
    args = parser.parse_args()
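
    # Example invocation spelling out the defaults (values are illustrative):
    #   python benchmarks/compare_accuracy_labeled.py --source ./src \
    #       --k 10 --coarse-k 100 --stage2-mode realtime --staged-cluster-strategy path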

    if not args.source.exists():
        raise SystemExit(f"Source path does not exist: {args.source}")
    labeled = _load_labeled_queries(args.queries_file, args.queries)
    if not labeled:
        raise SystemExit("No queries to run")
    source_root = args.source.expanduser().resolve()

    # Match CLI behavior: load settings + apply global/workspace .env overrides.
    config = Config.load()
    config.cascade_strategy = "staged"
    config.staged_stage2_mode = str(args.stage2_mode or "realtime").strip().lower()
    config.enable_staged_rerank = True
    config.staged_clustering_strategy = str(args.staged_cluster_strategy or "path").strip().lower()
    # Stability: on some Windows setups, DirectML/ONNX can crash under load.
    config.embedding_use_gpu = False

    registry = RegistryStore()
    registry.initialize()
    mapper = PathMapper()
    engine = ChainSearchEngine(registry=registry, mapper=mapper, config=config)
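
    # Labeled paths may be relative to --source or absolute; map both forms
    # into the same normalized key space used for engine results.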
    def resolve_expected(paths: Sequence[str]) -> set[str]:
        out: set[str] = set()
        for p in paths:
            try:
                cand = Path(p)
                if not cand.is_absolute():
                    cand = (source_root / cand).resolve()
                out.add(_normalize_path_key(str(cand)))
            except Exception:
                out.add(_normalize_path_key(p))
        return out

    evaluations: List[QueryEval] = []
    try:
        for i, item in enumerate(labeled, start=1):
            query = str(item.get("query", "")).strip()
            relevant_raw = item.get("relevant_paths") or []
            if not query:
                continue
            if not isinstance(relevant_raw, list) or not relevant_raw:
                raise SystemExit(f"Query item missing relevant_paths[]: {item!r}")
            relevant = resolve_expected([str(p) for p in relevant_raw])
            print(f"[{i}/{len(labeled)}] {query}")
            staged = _run_strategy(
                engine,
                strategy="staged",
                query=query,
                source_path=source_root,
                k=int(args.k),
                coarse_k=int(args.coarse_k),
                relevant=relevant,
                options=None,
            )
            dense = _run_strategy(
                engine,
                strategy="dense_rerank",
                query=query,
                source_path=source_root,
                k=int(args.k),
                coarse_k=int(args.coarse_k),
                relevant=relevant,
                options=None,
            )
            evaluations.append(
                QueryEval(
                    query=query,
                    relevant_paths=[
                        _normalize_path_key(str((source_root / p).resolve()))
                        if not Path(p).is_absolute()
                        else _normalize_path_key(p)
                        for p in relevant_raw
                    ],
                    staged=staged,
                    dense_rerank=dense,
                )
            )
    finally:
        try:
            engine.close()
        except Exception:
            pass
        try:
            registry.close()
        except Exception:
            pass
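
    # Aggregate per-strategy metrics. Latency averages exclude errored runs,
    # while Hit@K / MRR@K / Recall@K count an errored run as a miss.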
    staged_runs = [e.staged for e in evaluations]
    dense_runs = [e.dense_rerank for e in evaluations]

    def mean(xs: Sequence[float]) -> float:
        return statistics.mean(xs) if xs else 0.0

    staged_ranks = [r.first_hit_rank for r in staged_runs]
    dense_ranks = [r.first_hit_rank for r in dense_runs]
    summary = {
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
        "source": str(source_root),
        "queries_file": str(args.queries_file),
        "query_count": len(evaluations),
        "k": int(args.k),
        "coarse_k": int(args.coarse_k),
        "staged": {
            "hit_at_k": mean([1.0 if r.hit_at_k else 0.0 for r in staged_runs]),
            "mrr_at_k": _mrr(staged_ranks),
            "avg_recall_at_k": mean([r.recall_at_k for r in staged_runs]),
            "avg_latency_ms": mean([r.latency_ms for r in staged_runs if not r.error]),
            "errors": sum(1 for r in staged_runs if r.error),
        },
        "dense_rerank": {
            "hit_at_k": mean([1.0 if r.hit_at_k else 0.0 for r in dense_runs]),
            "mrr_at_k": _mrr(dense_ranks),
            "avg_recall_at_k": mean([r.recall_at_k for r in dense_runs]),
            "avg_latency_ms": mean([r.latency_ms for r in dense_runs if not r.error]),
            "errors": sum(1 for r in dense_runs if r.error),
        },
        "config": {
            "staged_stage2_mode": config.staged_stage2_mode,
            "staged_clustering_strategy": config.staged_clustering_strategy,
            "enable_staged_rerank": bool(config.enable_staged_rerank),
            "reranker_backend": config.reranker_backend,
            "reranker_model": config.reranker_model,
            "embedding_backend": config.embedding_backend,
            "embedding_model": config.embedding_model,
        },
    }
    payload = {"summary": summary, "evaluations": [asdict(e) for e in evaluations]}
    args.output.parent.mkdir(parents=True, exist_ok=True)
    args.output.write_text(json.dumps(payload, indent=2), encoding="utf-8")
    print("\n=== SUMMARY ===")
    print(json.dumps(summary, indent=2))
    print(f"\nSaved: {args.output}")


if __name__ == "__main__":
    main()