Claude-Code-Workflow/codex-lens/debug_semantic_v2.py

#!/usr/bin/env python
"""Debug script v2: Trace the full semantic search flow with detailed logging."""
import json
import logging
import sqlite3
import sys
from collections import defaultdict
from pathlib import Path
from typing import Any, Dict, List, Tuple

# Add src to path
sys.path.insert(0, str(Path(__file__).parent / "src"))

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)-5s | %(message)s",
    datefmt="%H:%M:%S",
)
logger = logging.getLogger("debug")
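
# Overview of the diagnostic flow driven by main():
#   1. count_chunks_by_category()     -- how many indexed chunks come from test vs. src files
#   2. compare_coarse_candidates()    -- per-directory ANN (HNSW) candidates before reranking
#   3. run_dense_search_with_trace()  -- the full dense_rerank cascade search with per-result scores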


def count_chunks_by_category(index_root: Path) -> Dict[str, int]:
    """Count chunks by category (src vs test) across all indexes."""
    counts = defaultdict(int)
    for db_path in index_root.rglob("_index.db"):
        try:
            conn = sqlite3.connect(str(db_path))
            try:
                cursor = conn.execute("""
                    SELECT file_path FROM semantic_chunks
                """)
                for row in cursor:
                    path = row[0]
                    if "tests" in path or "test_" in Path(path).name:
                        counts["test"] += 1
                    else:
                        counts["src"] += 1
            finally:
                conn.close()
        except sqlite3.Error as exc:
            logger.warning(f"Skipping {db_path}: {exc}")
    return dict(counts)
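

# The "test vs. src" heuristic used above is repeated verbatim in run_dense_search_with_trace()
# and compare_coarse_candidates(). A small helper like the sketch below could replace the three
# inline copies (offered as a suggestion only; the functions still use the inline expression):
def is_test_path(path: str) -> bool:
    """Return True if a file path looks like a test file (same check as the inline heuristic)."""
    return "tests" in path or "test_" in Path(path).name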


def run_dense_search_with_trace(query: str, source_path: Path) -> List[Dict]:
    """Run dense search with detailed tracing."""
    from codexlens.config import Config
    from codexlens.search.chain_search import ChainSearchEngine, SearchOptions
    from codexlens.storage.registry import Registry
    from codexlens.storage.path_mapper import PathMapper

    # Load config
    config = Config.load()
    registry = Registry(config.data_dir)
    mapper = PathMapper(config.data_dir)

    # Create search engine with verbose logging
    engine = ChainSearchEngine(registry, mapper, config=config)
    engine.logger.setLevel(logging.DEBUG)

    # Set up handler to capture all log output
    handler = logging.StreamHandler()
    handler.setLevel(logging.DEBUG)
    engine.logger.addHandler(handler)

    # Execute cascade search with dense_rerank strategy
    options = SearchOptions(depth=-1)  # Search all subdirectories
    logger.info("=" * 70)
    logger.info("Executing dense_rerank cascade search...")
    logger.info(f"Query: {query}")
    logger.info(f"Source: {source_path}")
    logger.info("=" * 70)
    result = engine.cascade_search(
        query=query,
        source_path=source_path,
        k=20,
        coarse_k=100,
        options=options,
        strategy="dense_rerank",
    )

    # Analyze results
    logger.info("\n" + "=" * 70)
    logger.info("SEARCH RESULTS ANALYSIS")
    logger.info("=" * 70)

    test_count = 0
    src_count = 0
    results_detail = []
    for i, r in enumerate(result.results):
        is_test = "tests" in r.path or "test_" in Path(r.path).name
        if is_test:
            test_count += 1
            category = "TEST"
        else:
            src_count += 1
            category = "SRC"

        # Get metadata scores if available
        pre_ce_score = r.metadata.get("pre_cross_encoder_score", r.score)
        ce_score = r.metadata.get("cross_encoder_score", 0)
        ce_prob = r.metadata.get("cross_encoder_prob", 0)

        results_detail.append({
            "rank": i + 1,
            "category": category,
            "path": r.path,
            "score": r.score,
            "pre_ce_score": pre_ce_score,
            "ce_score": ce_score,
            "ce_prob": ce_prob,
            "excerpt": r.excerpt[:100] if r.excerpt else "",
        })

        logger.info(f"{i+1:2d}. [{category:4s}] score={r.score:.4f} pre_ce={pre_ce_score:.4f} ce={ce_score:.4f}")
        logger.info(f" {r.path}")
        if r.excerpt:
            logger.info(f" {r.excerpt[:80]}...")
        logger.info("")

    logger.info(f"\nSummary: {src_count} SRC files, {test_count} TEST files in top {len(result.results)}")
    logger.info(f"Search time: {result.stats.time_ms:.2f}ms")
    return results_detail
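

# A minimal sketch (not part of the original flow) for quantifying how much the
# cross-encoder reranking stage promotes test files: re-sort the entries returned by
# run_dense_search_with_trace() by their pre-rerank dense score ("pre_ce_score") and
# compare TEST counts in the top-k against the final ranking. It relies only on the
# dict keys built above, so it makes no assumptions about the codexlens API.
def summarize_rerank_shift(results_detail: List[Dict], top_k: int = 10) -> Dict[str, int]:
    """Count TEST entries in the top-k before and after cross-encoder reranking."""
    final_order = sorted(results_detail, key=lambda r: r["rank"])
    dense_order = sorted(results_detail, key=lambda r: r["pre_ce_score"], reverse=True)
    return {
        "test_in_dense_topk": sum(1 for r in dense_order[:top_k] if r["category"] == "TEST"),
        "test_in_final_topk": sum(1 for r in final_order[:top_k] if r["category"] == "TEST"),
    }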


def compare_coarse_candidates():
    """Compare coarse candidates before and after reranking."""
    from codexlens.config import Config
    from codexlens.semantic.factory import get_embedder
    from codexlens.semantic.ann_index import ANNIndex

    # Query: "implementation logic of file indexing and embedding vector generation"
    query = "文件索引和嵌入向量生成的实现逻辑"
    config = Config.load()

    # Generate query embedding
    embedder = get_embedder(backend="litellm", model="qwen3-embedding-sf")
    query_embedding = embedder.embed_to_numpy([query])[0]

    logger.info("=" * 70)
    logger.info("COARSE CANDIDATE ANALYSIS (per directory)")
    logger.info("=" * 70)

    # Scan all HNSW indexes
    index_root = Path(r"C:\Users\dyw\.codexlens\indexes\D\Claude_dms3\codex-lens")
    all_candidates = []
    for hnsw_path in index_root.rglob("_index_vectors.hnsw"):
        db_path = hnsw_path.parent / "_index.db"
        if not db_path.exists():
            continue
        try:
            ann_index = ANNIndex(db_path, dim=query_embedding.shape[0])
            if not ann_index.load() or ann_index.count() == 0:
                continue
            ids, distances = ann_index.search(query_embedding, top_k=10)

            # Get file paths from chunks
            conn = sqlite3.connect(str(db_path))
            conn.row_factory = sqlite3.Row
            dir_name = hnsw_path.parent.relative_to(index_root)
            for chunk_id, dist in zip(ids, distances):
                cursor = conn.execute("""
                    SELECT file_path, content FROM semantic_chunks WHERE id = ?
                """, (int(chunk_id),))
                row = cursor.fetchone()
                if row:
                    is_test = "tests" in row["file_path"] or "test_" in Path(row["file_path"]).name
                    all_candidates.append({
                        "dir": str(dir_name),
                        "chunk_id": int(chunk_id),
                        "distance": float(dist),
                        "score": max(0, 1 - float(dist)),
                        "is_test": is_test,
                        "file_path": row["file_path"],
                        "content_preview": row["content"][:100] if row["content"] else "",
                    })
            conn.close()
        except Exception as e:
            logger.warning(f"Error processing {hnsw_path}: {e}")

    # Sort by distance (closest first)
    all_candidates.sort(key=lambda x: x["distance"])
    logger.info(f"\nTotal coarse candidates across all directories: {len(all_candidates)}")

    # Analyze distribution
    test_candidates = [c for c in all_candidates if c["is_test"]]
    src_candidates = [c for c in all_candidates if not c["is_test"]]
    logger.info(f"Test files: {len(test_candidates)}")
    logger.info(f"Src files: {len(src_candidates)}")
    if test_candidates:
        avg_test_dist = sum(c["distance"] for c in test_candidates) / len(test_candidates)
        logger.info(f"Avg test distance: {avg_test_dist:.4f}")
    if src_candidates:
        avg_src_dist = sum(c["distance"] for c in src_candidates) / len(src_candidates)
        logger.info(f"Avg src distance: {avg_src_dist:.4f}")

    logger.info("\nTop 30 candidates (combined from all directories):")
    logger.info("-" * 90)
    for i, c in enumerate(all_candidates[:30]):
        cat = "TEST" if c["is_test"] else "SRC"
        logger.info(f"{i+1:2d}. [{cat:4s}] dist={c['distance']:.4f} score={c['score']:.4f} dir={c['dir']}")
        logger.info(f" {Path(c['file_path']).name}")
    return all_candidates


def main():
    logger.info("=" * 70)
    logger.info("SEMANTIC SEARCH DEBUG SESSION")
    logger.info("=" * 70)

    # Step 1: Count chunks distribution
    index_root = Path(r"C:\Users\dyw\.codexlens\indexes\D\Claude_dms3\codex-lens")
    counts = count_chunks_by_category(index_root)
    logger.info("\nChunk distribution in index:")
    logger.info(f" - Test chunks: {counts.get('test', 0)}")
    logger.info(f" - Src chunks: {counts.get('src', 0)}")

    # Step 2: Compare coarse candidates
    logger.info("\n")
    candidates = compare_coarse_candidates()

    # Step 3: Run full search
    logger.info("\n")
    # Query: "implementation logic of file indexing and embedding vector generation"
    query = "文件索引和嵌入向量生成的实现逻辑"
    source_path = Path(r"D:\Claude_dms3\codex-lens")
    results = run_dense_search_with_trace(query, source_path)

    # Summary
    logger.info("\n" + "=" * 70)
    logger.info("ROOT CAUSE ANALYSIS")
    logger.info("=" * 70)

    test_in_top10 = sum(1 for r in results[:10] if r["category"] == "TEST")
    src_in_top10 = 10 - test_in_top10
    logger.info(f"\nTop 10 results: {src_in_top10} SRC, {test_in_top10} TEST")

    if test_in_top10 > src_in_top10:
        logger.info("\nPROBLEM: Test files dominate top results")
        logger.info("\nPossible causes:")
        logger.info(" 1. Test files mention implementation concepts explicitly")
        logger.info("    (e.g., docstrings describe what they test)")
        logger.info(" 2. Embedding model treats test descriptions as similar to")
        logger.info("    implementation descriptions")
        logger.info(" 3. Cross-encoder reranker gives higher scores to")
        logger.info("    descriptive test content over implementation code")

        # Check if coarse candidates already favor tests
        test_in_coarse_top30 = sum(1 for c in candidates[:30] if c["is_test"])
        if test_in_coarse_top30 > 15:
            logger.info("\n → Dense coarse search already favors tests")
            logger.info(f"   ({test_in_coarse_top30}/30 test files in coarse top-30)")
            logger.info("   Problem is at EMBEDDING/DENSE SEARCH stage")
        else:
            logger.info(f"\n → Coarse search is balanced ({test_in_coarse_top30}/30 tests)")
            logger.info("   Problem is at CROSS-ENCODER RERANKING stage")


if __name__ == "__main__":
    main()
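
# Note: the index root (C:\Users\dyw\.codexlens\indexes\D\Claude_dms3\codex-lens), the
# source path (D:\Claude_dms3\codex-lens), and the embedding setup (backend="litellm",
# model="qwen3-embedding-sf") are hard-coded above; adjust them for your environment
# before running `python debug_semantic_v2.py`.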