mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-02-11 02:33:51 +08:00
feat: add config options to adjust the reranking model weight and the test-file penalty, strengthening semantic search
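The diff below only adds a debug script, so a minimal sketch of the blended scoring these options point at might look like the following (illustrative only: the names rerank_weight and test_file_penalty and the formula are assumptions, not taken from this commit):

def blended_score(dense_score: float, ce_prob: float, path: str,
                  rerank_weight: float = 0.7,
                  test_file_penalty: float = 0.2) -> float:
    # Hypothetical blend of the dense retrieval score with the
    # cross-encoder probability (option names are assumed).
    score = (1 - rerank_weight) * dense_score + rerank_weight * ce_prob
    # Down-weight chunks that look like test files, mirroring the
    # "tests"/"test_" heuristic the debug script below uses.
    if "tests" in path or path.rsplit("/", 1)[-1].startswith("test_"):
        score *= 1 - test_file_penalty
    return score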
276
codex-lens/debug_semantic_v2.py
Normal file
@@ -0,0 +1,276 @@
#!/usr/bin/env python
"""Debug script v2: Trace the full semantic search flow with detailed logging."""

import logging
import sqlite3
import sys
from collections import defaultdict
from pathlib import Path
from typing import Dict, List

# Add src to path
sys.path.insert(0, str(Path(__file__).parent / "src"))

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)-5s | %(message)s",
    datefmt="%H:%M:%S",
)
logger = logging.getLogger("debug")


def count_chunks_by_category(index_root: Path) -> Dict[str, int]:
    """Count chunks by category (src vs test) across all indexes."""
    counts = defaultdict(int)
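
    # Each per-directory index is a standalone _index.db; every row in its
    # semantic_chunks table is one chunk, classified here with the same
    # "tests"/"test_" path heuristic applied to the search results later.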
    for db_path in index_root.rglob("_index.db"):
        try:
            conn = sqlite3.connect(str(db_path))
            cursor = conn.execute("""
                SELECT file_path FROM semantic_chunks
            """)
            for row in cursor:
                path = row[0]
                if "tests" in path or "test_" in Path(path).name:
                    counts["test"] += 1
                else:
                    counts["src"] += 1
            conn.close()
        except sqlite3.Error as exc:
            # Surface per-index failures instead of silently skewing counts.
            logger.warning(f"Skipping {db_path}: {exc}")

    return dict(counts)


def run_dense_search_with_trace(query: str, source_path: Path) -> List[Dict]:
    """Run dense search with detailed tracing."""
    from codexlens.config import Config
    from codexlens.search.chain_search import ChainSearchEngine, SearchOptions
    from codexlens.storage.registry import Registry
    from codexlens.storage.path_mapper import PathMapper

    # Load config
    config = Config.load()
    registry = Registry(config.data_dir)
    mapper = PathMapper(config.data_dir)

    # Create search engine with verbose logging
    engine = ChainSearchEngine(registry, mapper, config=config)
    engine.logger.setLevel(logging.DEBUG)

    # Set up handler to capture all log output
    handler = logging.StreamHandler()
    handler.setLevel(logging.DEBUG)
    engine.logger.addHandler(handler)

    # Execute cascade search with dense_rerank strategy
    options = SearchOptions(depth=-1)  # Search all subdirectories
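
    # "dense_rerank" cascade: the dense (embedding) stage gathers coarse_k
    # candidates, then a cross-encoder reranks them down to the top k; the
    # pre-/post-rerank scores are read back from result metadata below.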
    logger.info("=" * 70)
    logger.info("Executing dense_rerank cascade search...")
    logger.info(f"Query: {query}")
    logger.info(f"Source: {source_path}")
    logger.info("=" * 70)

    result = engine.cascade_search(
        query=query,
        source_path=source_path,
        k=20,
        coarse_k=100,
        options=options,
        strategy="dense_rerank",
    )

    # Analyze results
    logger.info("\n" + "=" * 70)
    logger.info("SEARCH RESULTS ANALYSIS")
    logger.info("=" * 70)

    test_count = 0
    src_count = 0
    results_detail = []

    for i, r in enumerate(result.results):
        is_test = "tests" in r.path or "test_" in Path(r.path).name
        if is_test:
            test_count += 1
            category = "TEST"
        else:
            src_count += 1
            category = "SRC"

        # Get metadata scores if available
        pre_ce_score = r.metadata.get("pre_cross_encoder_score", r.score)
        ce_score = r.metadata.get("cross_encoder_score", 0)
        ce_prob = r.metadata.get("cross_encoder_prob", 0)

        results_detail.append({
            "rank": i + 1,
            "category": category,
            "path": r.path,
            "score": r.score,
            "pre_ce_score": pre_ce_score,
            "ce_score": ce_score,
            "ce_prob": ce_prob,
            "excerpt": r.excerpt[:100] if r.excerpt else "",
        })

        logger.info(f"{i+1:2d}. [{category:4s}] score={r.score:.4f} pre_ce={pre_ce_score:.4f} ce={ce_score:.4f}")
        logger.info(f"    {r.path}")
        if r.excerpt:
            logger.info(f"    {r.excerpt[:80]}...")
        logger.info("")

    logger.info(f"\nSummary: {src_count} SRC files, {test_count} TEST files in top {len(result.results)}")
    logger.info(f"Search time: {result.stats.time_ms:.2f}ms")

    return results_detail


def compare_coarse_candidates():
    """Compare coarse candidates before and after reranking."""
    from codexlens.config import Config
    from codexlens.semantic.factory import get_embedder
    from codexlens.semantic.ann_index import ANNIndex

    query = "文件索引和嵌入向量生成的实现逻辑"
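    # (The query is Chinese for "the implementation logic of file indexing
    # and embedding generation".)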
    config = Config.load()

    # Generate query embedding
    embedder = get_embedder(backend="litellm", model="qwen3-embedding-sf")
    query_embedding = embedder.embed_to_numpy([query])[0]

    logger.info("=" * 70)
    logger.info("COARSE CANDIDATE ANALYSIS (per directory)")
    logger.info("=" * 70)

    # Scan all HNSW indexes
    index_root = Path(r"C:\Users\dyw\.codexlens\indexes\D\Claude_dms3\codex-lens")

    all_candidates = []

    for hnsw_path in index_root.rglob("_index_vectors.hnsw"):
        db_path = hnsw_path.parent / "_index.db"
        if not db_path.exists():
            continue

        try:
            ann_index = ANNIndex(db_path, dim=query_embedding.shape[0])
            if not ann_index.load() or ann_index.count() == 0:
                continue

            ids, distances = ann_index.search(query_embedding, top_k=10)
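            # Each directory index contributes its own top-10 here; the
            # global ranking comes from merging and sorting on distance below.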

            # Get file paths from chunks
            conn = sqlite3.connect(str(db_path))
            conn.row_factory = sqlite3.Row

            dir_name = hnsw_path.parent.relative_to(index_root)

            for chunk_id, dist in zip(ids, distances):
                cursor = conn.execute("""
                    SELECT file_path, content FROM semantic_chunks WHERE id = ?
                """, (int(chunk_id),))
                row = cursor.fetchone()
                if row:
                    is_test = "tests" in row["file_path"] or "test_" in Path(row["file_path"]).name
                    all_candidates.append({
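                        # "score" below assumes a cosine-style distance from
                        # the HNSW index: similarity = 1 - distance, clamped at 0.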
                        "dir": str(dir_name),
                        "chunk_id": int(chunk_id),
                        "distance": float(dist),
                        "score": max(0, 1 - float(dist)),
                        "is_test": is_test,
                        "file_path": row["file_path"],
                        "content_preview": row["content"][:100] if row["content"] else "",
                    })
            conn.close()

        except Exception as e:
            logger.warning(f"Error processing {hnsw_path}: {e}")

    # Sort by distance (closest first)
    all_candidates.sort(key=lambda x: x["distance"])

    logger.info(f"\nTotal coarse candidates across all directories: {len(all_candidates)}")

    # Analyze distribution
    test_candidates = [c for c in all_candidates if c["is_test"]]
    src_candidates = [c for c in all_candidates if not c["is_test"]]

    logger.info(f"Test files: {len(test_candidates)}")
    logger.info(f"Src files: {len(src_candidates)}")

    if test_candidates:
        avg_test_dist = sum(c["distance"] for c in test_candidates) / len(test_candidates)
        logger.info(f"Avg test distance: {avg_test_dist:.4f}")
    if src_candidates:
        avg_src_dist = sum(c["distance"] for c in src_candidates) / len(src_candidates)
        logger.info(f"Avg src distance: {avg_src_dist:.4f}")

    logger.info("\nTop 30 candidates (combined from all directories):")
    logger.info("-" * 90)
    for i, c in enumerate(all_candidates[:30]):
        cat = "TEST" if c["is_test"] else "SRC"
        logger.info(f"{i+1:2d}. [{cat:4s}] dist={c['distance']:.4f} score={c['score']:.4f} dir={c['dir']}")
        logger.info(f"    {Path(c['file_path']).name}")

    return all_candidates


def main():
    logger.info("=" * 70)
    logger.info("SEMANTIC SEARCH DEBUG SESSION")
    logger.info("=" * 70)

    # Step 1: Count chunks distribution
    index_root = Path(r"C:\Users\dyw\.codexlens\indexes\D\Claude_dms3\codex-lens")
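    # NOTE: hard-coded to this machine's index location; adjust the drive
    # and project paths when running the script elsewhere.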
    counts = count_chunks_by_category(index_root)
    logger.info("\nChunk distribution in index:")
    logger.info(f"  - Test chunks: {counts.get('test', 0)}")
    logger.info(f"  - Src chunks: {counts.get('src', 0)}")

    # Step 2: Compare coarse candidates
    logger.info("\n")
    candidates = compare_coarse_candidates()

    # Step 3: Run full search
    logger.info("\n")
    query = "文件索引和嵌入向量生成的实现逻辑"
    source_path = Path(r"D:\Claude_dms3\codex-lens")
    results = run_dense_search_with_trace(query, source_path)

    # Summary
    logger.info("\n" + "=" * 70)
    logger.info("ROOT CAUSE ANALYSIS")
    logger.info("=" * 70)

    top10 = results[:10]
    test_in_top10 = sum(1 for r in top10 if r["category"] == "TEST")
    # Count SRC against the actual slice length rather than a hard-coded 10,
    # in case fewer than 10 results came back.
    src_in_top10 = len(top10) - test_in_top10

    logger.info(f"\nTop 10 results: {src_in_top10} SRC, {test_in_top10} TEST")

    if test_in_top10 > src_in_top10:
        logger.info("\nPROBLEM: Test files dominate top results")
        logger.info("\nPossible causes:")
        logger.info("  1. Test files mention implementation concepts explicitly")
        logger.info("     (e.g., docstrings describe what they test)")
        logger.info("  2. Embedding model treats test descriptions as similar to")
        logger.info("     implementation descriptions")
        logger.info("  3. Cross-encoder reranker gives higher scores to")
        logger.info("     descriptive test content over implementation code")

        # Check whether the coarse candidates already favor tests
        test_in_coarse_top30 = sum(1 for c in candidates[:30] if c["is_test"])
        if test_in_coarse_top30 > 15:
            logger.info("\n  → Dense coarse search already favors tests")
            logger.info(f"    ({test_in_coarse_top30}/30 test files in coarse top-30)")
            logger.info("    Problem is at EMBEDDING/DENSE SEARCH stage")
        else:
            logger.info(f"\n  → Coarse search is balanced ({test_in_coarse_top30}/30 tests)")
            logger.info("    Problem is at CROSS-ENCODER RERANKING stage")


if __name__ == "__main__":
    main()