Claude-Code-Workflow/codex-lens/scripts/test_misleading_comments.py
catlog22 d21066c282 Add scripts for inspecting LLM summaries and testing misleading comments
- Implement `inspect_llm_summaries.py` to display LLM-generated summaries from the semantic_chunks table in the database.
- Create `show_llm_analysis.py` to demonstrate LLM analysis of misleading code examples, highlighting discrepancies between comments and actual functionality.
- Develop `test_misleading_comments.py` to compare pure vector search with LLM-enhanced search, focusing on the impact of misleading or missing comments on search results.
- Introduce `test_llm_enhanced_search.py` to provide a test suite for evaluating the effectiveness of LLM-enhanced vector search against pure vector search.
- Ensure all new scripts are integrated with the existing codebase and follow the established coding standards.
2025-12-16 20:29:28 +08:00
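Typical invocations for reference (the flags and the scripts/ path are taken from the script's own argparse setup and print statements below):

    python scripts/test_misleading_comments.py --tool gemini
    python scripts/test_misleading_comments.py --tool qwen --keep-db ./test_results.db
    python scripts/test_misleading_comments.py --skip-llm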


#!/usr/bin/env python3
"""Test pure vector vs LLM-enhanced search with misleading/missing comments.
This test demonstrates how LLM enhancement can overcome:
1. Missing comments/docstrings
2. Misleading or incorrect comments
3. Outdated documentation
Usage:
python test_misleading_comments.py --tool gemini
"""
import argparse
import sqlite3
import sys
import tempfile
import time
from pathlib import Path
from typing import Dict, List
# Check dependencies
try:
    from codexlens.semantic import SEMANTIC_AVAILABLE
    from codexlens.semantic.embedder import Embedder
    from codexlens.semantic.vector_store import VectorStore
    from codexlens.semantic.chunker import Chunker, ChunkConfig
    from codexlens.semantic.llm_enhancer import (
        LLMEnhancer,
        LLMConfig,
        FileData,
        EnhancedSemanticIndexer,
    )
    from codexlens.storage.dir_index import DirIndexStore
    from codexlens.search.hybrid_search import HybridSearchEngine
except ImportError as e:
    print(f"Error: Missing dependencies - {e}")
    print("Install with: pip install codexlens[semantic]")
    sys.exit(1)

if not SEMANTIC_AVAILABLE:
    print("Error: Semantic search dependencies not available")
    sys.exit(1)
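
# Pipeline overview (both tests share the same fastembed + HybridSearchEngine
# query path below; they differ only in what gets embedded):
#   pure vector:  file content -> Chunker -> Embedder -> VectorStore
#   LLM-enhanced: file content -> LLMEnhancer summary -> Embedder -> VectorStore
#                 (wired together by EnhancedSemanticIndexer)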

# Test dataset with MISLEADING or MISSING comments
MISLEADING_DATASET = {
    "crypto/hasher.py": '''"""Simple string utilities."""
import bcrypt

def process_string(s: str, rounds: int = 12) -> str:
    """Convert string to uppercase."""
    salt = bcrypt.gensalt(rounds=rounds)
    hashed = bcrypt.hashpw(s.encode('utf-8'), salt)
    return hashed.decode('utf-8')

def check_string(s: str, target: str) -> bool:
    """Check if two strings are equal."""
    return bcrypt.checkpw(s.encode('utf-8'), target.encode('utf-8'))
''',
    "auth/token.py": '''import jwt
from datetime import datetime, timedelta

SECRET_KEY = "key123"

def make_thing(uid: int, exp: int = 3600) -> str:
    payload = {
        'user_id': uid,
        'exp': datetime.utcnow() + timedelta(seconds=exp),
        'iat': datetime.utcnow()
    }
    return jwt.encode(payload, SECRET_KEY, algorithm='HS256')

def parse_thing(thing: str) -> dict:
    try:
        return jwt.decode(thing, SECRET_KEY, algorithms=['HS256'])
    except jwt.ExpiredSignatureError:
        return None
''',
    "api/handlers.py": '''"""Database connection utilities."""
from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route('/api/items', methods=['POST'])
def create_item():
    """Delete an existing item."""
    data = request.get_json()
    if not data.get('email') or not data.get('password'):
        return jsonify({'error': 'Missing data'}), 400
    item_id = 12345
    return jsonify({'item_id': item_id, 'success': True}), 201

@app.route('/api/items/<int:item_id>', methods=['GET'])
def get_item(item_id: int):
    """Update item configuration."""
    item = {
        'id': item_id,
        'email': 'user@example.com',
        'name': 'John Doe'
    }
    return jsonify(item), 200
''',
    "utils/checker.py": '''"""Math calculation functions."""
import re

def calc_sum(email: str) -> bool:
    pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$'
    return bool(re.match(pattern, email))

def format_text(text: str, max_len: int = 255) -> str:
    text = re.sub(r'[<>"\\'&]', '', text)
    return text.strip()[:max_len]
''',
    "db/pool.py": '''"""Email sending service."""
import psycopg2
from psycopg2 import pool
from contextlib import contextmanager

class EmailSender:
    """SMTP email sender with retry logic."""

    def __init__(self, min_conn: int = 1, max_conn: int = 10):
        """Initialize email sender."""
        self.pool = psycopg2.pool.SimpleConnectionPool(
            min_conn, max_conn,
            user='dbuser', host='localhost', database='myapp'
        )

    @contextmanager
    def send_email(self):
        """Send email message."""
        conn = self.pool.getconn()
        try:
            yield conn
            conn.commit()
        finally:
            self.pool.putconn(conn)
''',
}
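
# Actual behavior of the files above (deliberately contradicting their
# docstrings and names): hasher.py does bcrypt password hashing, token.py
# issues/parses JWTs, handlers.py is a Flask REST API, checker.py validates
# email addresses, and pool.py manages a psycopg2 connection pool.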

# Test queries - natural language based on ACTUAL functionality (not misleading comments)
TEST_QUERIES = [
    ("How to hash passwords securely with bcrypt?", "crypto/hasher.py"),
    ("Generate JWT authentication token", "auth/token.py"),
    ("Create user account REST API endpoint", "api/handlers.py"),
    ("Validate email address format", "utils/checker.py"),
    ("PostgreSQL database connection pool", "db/pool.py"),
]
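
# Each query targets exactly one dataset file; ranks within the top-3 results
# are scored later in compare_results (rank 1 = 3 points, rank 2 = 2, rank 3 = 1).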

def create_test_database(db_path: Path) -> None:
    """Create and populate test database."""
    store = DirIndexStore(db_path)
    store.initialize()
    with store._get_connection() as conn:
        for path, content in MISLEADING_DATASET.items():
            name = path.split('/')[-1]
            conn.execute(
                """INSERT INTO files (name, full_path, content, language, mtime)
                VALUES (?, ?, ?, ?, ?)""",
                (name, path, content, "python", 0.0)
            )
        conn.commit()
    store.close()
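
# Note: this helper goes through DirIndexStore's private _get_connection()
# and assumes a files(name, full_path, content, language, mtime) table -
# both taken directly from the INSERT statement above.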

def test_pure_vector_search(db_path: Path) -> Dict:
    """Test pure vector search (relies on code + misleading comments)."""
    print("\n" + "="*70)
    print("PURE VECTOR SEARCH (Code + Misleading Comments -> fastembed)")
    print("="*70)

    start_time = time.time()

    # Generate pure vector embeddings
    embedder = Embedder(profile="code")
    vector_store = VectorStore(db_path)
    chunker = Chunker(config=ChunkConfig(max_chunk_size=2000))

    with sqlite3.connect(db_path) as conn:
        conn.row_factory = sqlite3.Row
        rows = conn.execute("SELECT full_path, content FROM files").fetchall()

    chunk_count = 0
    for row in rows:
        chunks = chunker.chunk_sliding_window(
            row["content"],
            file_path=row["full_path"],
            language="python"
        )
        for chunk in chunks:
            chunk.embedding = embedder.embed_single(chunk.content)
            chunk.metadata["strategy"] = "pure_vector"
        if chunks:
            vector_store.add_chunks(chunks, row["full_path"])
            chunk_count += len(chunks)

    setup_time = time.time() - start_time
    print(f"Setup: {len(rows)} files, {chunk_count} chunks in {setup_time:.1f}s")
    print("Note: Embeddings include misleading comments")

    # Test queries
    engine = HybridSearchEngine()
    results = {}

    print(f"\n{'Query':<45} {'Top Result':<30} {'Score':<8}")
    print("-" * 70)

    for query, expected_file in TEST_QUERIES:
        search_results = engine.search(
            db_path,
            query,
            limit=3,
            enable_vector=True,
            pure_vector=True,
        )
        top_file = search_results[0].path if search_results else "No results"
        top_score = search_results[0].score if search_results else 0.0
        found = expected_file in [r.path for r in search_results]
        rank = None
        if found:
            for i, r in enumerate(search_results):
                if r.path == expected_file:
                    rank = i + 1
                    break
        status = "[OK]" if found and rank == 1 else ("[~]" if found else "[X]")
        display_query = query[:42] + "..." if len(query) > 45 else query
        display_file = top_file.split('/')[-1] if '/' in top_file else top_file
        print(f"{status} {display_query:<43} {display_file:<30} {top_score:.3f}")
        results[query] = {
            "found": found,
            "rank": rank,
            "top_file": top_file,
            "score": top_score,
        }

    return results

def test_llm_enhanced_search(db_path: Path, llm_tool: str = "gemini") -> Dict:
    """Test LLM-enhanced search (LLM reads code and generates accurate summary)."""
    print("\n" + "="*70)
    print(f"LLM-ENHANCED SEARCH (Code -> {llm_tool.upper()} Analysis -> fastembed)")
    print("="*70)

    # Check CCW availability
    llm_config = LLMConfig(enabled=True, tool=llm_tool, batch_size=2)
    enhancer = LLMEnhancer(llm_config)
    if not enhancer.check_available():
        print("[X] CCW CLI not available - skipping LLM-enhanced test")
        print("    Install CCW: npm install -g ccw")
        return {}

    start_time = time.time()

    # Generate LLM-enhanced embeddings
    embedder = Embedder(profile="code")
    vector_store = VectorStore(db_path)
    indexer = EnhancedSemanticIndexer(enhancer, embedder, vector_store)

    # Prepare file data
    file_data_list = [
        FileData(path=path, content=content, language="python")
        for path, content in MISLEADING_DATASET.items()
    ]

    # Index with LLM enhancement
    print("LLM analyzing code (ignoring misleading comments)...")
    indexed = indexer.index_files(file_data_list)

    setup_time = time.time() - start_time
    print(f"Setup: {indexed}/{len(file_data_list)} files indexed in {setup_time:.1f}s")
    print("Note: LLM generates summaries based on actual code logic")

    # Test queries - same search settings as the pure-vector run (pure_vector=True);
    # only the stored embeddings differ between the two tests
    engine = HybridSearchEngine()
    results = {}

    print(f"\n{'Query':<45} {'Top Result':<30} {'Score':<8}")
    print("-" * 70)

    for query, expected_file in TEST_QUERIES:
        search_results = engine.search(
            db_path,
            query,
            limit=3,
            enable_vector=True,
            pure_vector=True,
        )
        top_file = search_results[0].path if search_results else "No results"
        top_score = search_results[0].score if search_results else 0.0
        found = expected_file in [r.path for r in search_results]
        rank = None
        if found:
            for i, r in enumerate(search_results):
                if r.path == expected_file:
                    rank = i + 1
                    break
        status = "[OK]" if found and rank == 1 else ("[~]" if found else "[X]")
        display_query = query[:42] + "..." if len(query) > 45 else query
        display_file = top_file.split('/')[-1] if '/' in top_file else top_file
        print(f"{status} {display_query:<43} {display_file:<30} {top_score:.3f}")
        results[query] = {
            "found": found,
            "rank": rank,
            "top_file": top_file,
            "score": top_score,
        }

    return results

def compare_results(pure_results: Dict, llm_results: Dict) -> None:
    """Compare and analyze results from both approaches."""
    print("\n" + "="*70)
    print("COMPARISON SUMMARY - MISLEADING COMMENTS TEST")
    print("="*70)

    if not llm_results:
        print("Cannot compare - LLM-enhanced test was skipped")
        return

    pure_score = 0
    llm_score = 0

    print(f"\n{'Query':<45} {'Pure':<10} {'LLM':<10}")
    print("-" * 70)

    for query, expected_file in TEST_QUERIES:
        pure_res = pure_results.get(query, {})
        llm_res = llm_results.get(query, {})
        pure_status = f"[OK] Rank {pure_res.get('rank', '?')}" if pure_res.get('found') else "[X] Miss"
        llm_status = f"[OK] Rank {llm_res.get('rank', '?')}" if llm_res.get('found') else "[X] Miss"
        # Scoring: Rank 1 = 3 points, Rank 2 = 2 points, Rank 3 = 1 point
        if pure_res.get('found') and pure_res.get('rank'):
            pure_score += max(0, 4 - pure_res['rank'])
        if llm_res.get('found') and llm_res.get('rank'):
            llm_score += max(0, 4 - llm_res['rank'])
        display_query = query[:42] + "..." if len(query) > 45 else query
        print(f"{display_query:<45} {pure_status:<10} {llm_status:<10}")

    print("-" * 70)
    print(f"{'TOTAL SCORE':<45} {pure_score:<10} {llm_score:<10}")
    print("="*70)

    # Analysis
    print("\nANALYSIS:")
    if llm_score > pure_score:
        improvement = ((llm_score - pure_score) / max(pure_score, 1)) * 100
        print(f"[OK] LLM enhancement improves results by {improvement:.1f}%")
        print("     LLM understands actual code logic despite misleading comments")
        print("     Pure vector search misled by incorrect documentation")
    elif pure_score > llm_score:
        degradation = ((pure_score - llm_score) / max(pure_score, 1)) * 100
        print(f"[X] Pure vector performed {degradation:.1f}% better")
        print("    Unexpected: Pure vector wasn't affected by misleading comments")
    else:
        print("= Both approaches performed equally")
        print("  Test dataset may still be too simple to show differences")

    print("\nKEY INSIGHTS:")
    print("- Pure Vector: Embeds code + comments together, can be misled")
    print("- LLM Enhanced: Analyzes actual code behavior, ignores bad comments")
    print("- Best Use: LLM enhancement crucial for poorly documented codebases")

    print("\nMISLEADING COMMENTS IN TEST:")
    print("1. 'hasher.py' claims 'string utilities' but does bcrypt hashing")
    print("2. 'token.py' has no docstrings, unclear function names")
    print("3. 'handlers.py' says 'database utilities' but is REST API")
    print("4. 'handlers.py' docstrings opposite (create says delete, etc.)")
    print("5. 'checker.py' claims 'math functions' but validates emails")
    print("6. 'pool.py' claims 'email sender' but is database pool")

def main():
    parser = argparse.ArgumentParser(
        description="Test pure vector vs LLM-enhanced with misleading comments"
    )
    parser.add_argument(
        "--tool",
        choices=["gemini", "qwen"],
        default="gemini",
        help="LLM tool to use (default: gemini)"
    )
    parser.add_argument(
        "--skip-llm",
        action="store_true",
        help="Skip LLM-enhanced test"
    )
    parser.add_argument(
        "--keep-db",
        type=str,
        help="Save database to specified path for inspection (e.g., ./test_results.db)"
    )
    args = parser.parse_args()

    print("\n" + "="*70)
    print("MISLEADING COMMENTS TEST")
    print("Pure Vector vs LLM-Enhanced with Incorrect Documentation")
    print("="*70)

    # Create test database
    with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f:
        db_path = Path(f.name)

    try:
        print(f"\nTest dataset: {len(MISLEADING_DATASET)} Python files")
        print(f"Test queries: {len(TEST_QUERIES)} natural language questions")
        print("\nChallenges:")
        print("- Misleading module docstrings")
        print("- Incorrect function docstrings")
        print("- Missing documentation")
        print("- Unclear function names")

        create_test_database(db_path)

        # Test pure vector search
        pure_results = test_pure_vector_search(db_path)

        # Test LLM-enhanced search
        if not args.skip_llm:
            # Clear semantic_chunks table for LLM test
            with sqlite3.connect(db_path) as conn:
                conn.execute("DELETE FROM semantic_chunks")
                conn.commit()
            llm_results = test_llm_enhanced_search(db_path, args.tool)
        else:
            llm_results = {}
            print("\n[X] LLM-enhanced test skipped (--skip-llm flag)")

        # Compare results
        compare_results(pure_results, llm_results)
    finally:
        # Save or cleanup database
        if args.keep_db:
            import shutil
            save_path = Path(args.keep_db)
            try:
                import gc
                gc.collect()
                time.sleep(0.2)
                shutil.copy2(db_path, save_path)
                print(f"\n[OK] Database saved to: {save_path}")
                print(f"Inspect with: python scripts/inspect_llm_summaries.py {save_path}")
            except Exception as e:
                print(f"\n[X] Failed to save database: {e}")
            finally:
                try:
                    if db_path.exists():
                        db_path.unlink()
                except OSError:
                    pass
        else:
            # Cleanup
            try:
                import gc
                gc.collect()
                time.sleep(0.1)
                if db_path.exists():
                    db_path.unlink()
            except PermissionError:
                print(f"\nWarning: Could not delete temporary database: {db_path}")

    print("\n" + "="*70)
    print("Test completed!")
    print("="*70)

if __name__ == "__main__":
    main()