Remove LLM enhancement features and related components as per user request. Delete the source files, CLI commands, front-end components, tests, scripts, and documentation associated with LLM functionality. Simplify dependencies and reduce complexity while retaining core vector search. Validation confirmed the removal is complete and core search still works.

catlog22
2025-12-16 21:38:27 +08:00
parent d21066c282
commit b702791c2c
21 changed files with 375 additions and 7193 deletions
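The retained pure-vector path can be pieced together from the API calls in the deleted comparison scripts below. A minimal sketch, assuming those signatures survive the removal unchanged (index initialization and error handling omitted; file names are placeholders):

from pathlib import Path

from codexlens.semantic.embedder import Embedder
from codexlens.semantic.vector_store import VectorStore
from codexlens.semantic.chunker import Chunker, ChunkConfig
from codexlens.search.hybrid_search import HybridSearchEngine

db_path = Path("_index.db")  # hypothetical existing index database
source = Path("app.py").read_text()  # hypothetical file to index

# Chunk and embed raw code directly - no LLM summarization step.
embedder = Embedder(profile="code")
vector_store = VectorStore(db_path)
chunker = Chunker(config=ChunkConfig(max_chunk_size=2000))

chunks = chunker.chunk_sliding_window(source, file_path="app.py", language="python")
for chunk in chunks:
    chunk.embedding = embedder.embed_single(chunk.content)
vector_store.add_chunks(chunks, "app.py")

# Query in pure-vector mode through the hybrid engine.
engine = HybridSearchEngine()
for result in engine.search(db_path, "hash passwords securely", limit=3,
                            enable_vector=True, pure_vector=True):
    print(result.path, result.score)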

View File: compare_search_methods.py

@@ -1,465 +0,0 @@
#!/usr/bin/env python3
"""Standalone script to compare pure vector vs LLM-enhanced semantic search.
Usage:
python compare_search_methods.py [--tool gemini|qwen] [--skip-llm]
This script:
1. Creates a test dataset with sample code
2. Tests pure vector search (code → fastembed → search)
3. Tests LLM-enhanced search (code → LLM summary → fastembed → search)
4. Compares results across natural language queries
"""
import argparse
import sqlite3
import sys
import tempfile
import time
from pathlib import Path
from typing import Dict, List, Tuple
# Check dependencies
try:
from codexlens.semantic import SEMANTIC_AVAILABLE
from codexlens.semantic.embedder import Embedder
from codexlens.semantic.vector_store import VectorStore
from codexlens.semantic.chunker import Chunker, ChunkConfig
from codexlens.semantic.llm_enhancer import (
LLMEnhancer,
LLMConfig,
FileData,
EnhancedSemanticIndexer,
)
from codexlens.storage.dir_index import DirIndexStore
from codexlens.search.hybrid_search import HybridSearchEngine
except ImportError as e:
print(f"Error: Missing dependencies - {e}")
print("Install with: pip install codexlens[semantic]")
sys.exit(1)
if not SEMANTIC_AVAILABLE:
print("Error: Semantic search dependencies not available")
print("Install with: pip install codexlens[semantic]")
sys.exit(1)
# Test dataset with realistic code samples
TEST_DATASET = {
"auth/password_hasher.py": '''"""Password hashing utilities using bcrypt."""
import bcrypt
def hash_password(password: str, salt_rounds: int = 12) -> str:
"""Hash a password using bcrypt with specified salt rounds."""
salt = bcrypt.gensalt(rounds=salt_rounds)
hashed = bcrypt.hashpw(password.encode('utf-8'), salt)
return hashed.decode('utf-8')
def verify_password(password: str, hashed: str) -> bool:
"""Verify a password against its hash."""
return bcrypt.checkpw(password.encode('utf-8'), hashed.encode('utf-8'))
''',
"auth/jwt_handler.py": '''"""JWT token generation and validation."""
import jwt
from datetime import datetime, timedelta
SECRET_KEY = "your-secret-key"
def create_token(user_id: int, expires_in: int = 3600) -> str:
"""Generate a JWT access token for user authentication."""
payload = {
'user_id': user_id,
'exp': datetime.utcnow() + timedelta(seconds=expires_in),
'iat': datetime.utcnow()
}
return jwt.encode(payload, SECRET_KEY, algorithm='HS256')
def decode_token(token: str) -> dict:
"""Validate and decode JWT token."""
try:
return jwt.decode(token, SECRET_KEY, algorithms=['HS256'])
except jwt.ExpiredSignatureError:
return None
''',
"api/user_endpoints.py": '''"""REST API endpoints for user management."""
from flask import Flask, request, jsonify
app = Flask(__name__)
@app.route('/api/users', methods=['POST'])
def create_user():
"""Create a new user account with email and password."""
data = request.get_json()
if not data.get('email') or not data.get('password'):
return jsonify({'error': 'Email and password required'}), 400
user_id = 12345 # Database insert
return jsonify({'user_id': user_id, 'success': True}), 201
@app.route('/api/users/<int:user_id>', methods=['GET'])
def get_user(user_id: int):
"""Retrieve user profile information by user ID."""
user = {
'id': user_id,
'email': 'user@example.com',
'name': 'John Doe'
}
return jsonify(user), 200
''',
"utils/validation.py": '''"""Input validation utilities."""
import re
def validate_email(email: str) -> bool:
"""Check if email address format is valid using regex."""
pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
return bool(re.match(pattern, email))
def sanitize_input(text: str, max_length: int = 255) -> str:
"""Clean user input by removing special characters."""
text = re.sub(r'[<>\"\'&]', '', text)
return text.strip()[:max_length]
def validate_password_strength(password: str) -> tuple:
"""Validate password meets security requirements."""
if len(password) < 8:
return False, "Password must be at least 8 characters"
if not re.search(r'[A-Z]', password):
return False, "Must contain uppercase letter"
return True, None
''',
"database/connection.py": '''"""Database connection pooling."""
import psycopg2
from psycopg2 import pool
from contextlib import contextmanager
class DatabasePool:
"""PostgreSQL connection pool manager."""
def __init__(self, min_conn: int = 1, max_conn: int = 10):
"""Initialize database connection pool."""
self.pool = psycopg2.pool.SimpleConnectionPool(
min_conn, max_conn,
user='dbuser', host='localhost', database='myapp'
)
@contextmanager
def get_connection(self):
"""Get a connection from pool as context manager."""
conn = self.pool.getconn()
try:
yield conn
conn.commit()
finally:
self.pool.putconn(conn)
''',
}
# Natural language test queries
TEST_QUERIES = [
("How do I securely hash passwords?", "auth/password_hasher.py"),
("Generate JWT token for authentication", "auth/jwt_handler.py"),
("Create new user account via API", "api/user_endpoints.py"),
("Validate email address format", "utils/validation.py"),
("Connect to PostgreSQL database", "database/connection.py"),
]
def create_test_database(db_path: Path) -> None:
"""Create and populate test database."""
store = DirIndexStore(db_path)
store.initialize()
with store._get_connection() as conn:
for path, content in TEST_DATASET.items():
name = path.split('/')[-1]
conn.execute(
"""INSERT INTO files (name, full_path, content, language, mtime)
VALUES (?, ?, ?, ?, ?)""",
(name, path, content, "python", 0.0)
)
conn.commit()
store.close()
def test_pure_vector_search(db_path: Path) -> Dict:
"""Test pure vector search (raw code embeddings)."""
print("\n" + "="*70)
print("PURE VECTOR SEARCH (Code → fastembed)")
print("="*70)
start_time = time.time()
# Generate pure vector embeddings
embedder = Embedder(profile="code")
vector_store = VectorStore(db_path)
chunker = Chunker(config=ChunkConfig(max_chunk_size=2000))
with sqlite3.connect(db_path) as conn:
conn.row_factory = sqlite3.Row
rows = conn.execute("SELECT full_path, content FROM files").fetchall()
chunk_count = 0
for row in rows:
chunks = chunker.chunk_sliding_window(
row["content"],
file_path=row["full_path"],
language="python"
)
for chunk in chunks:
chunk.embedding = embedder.embed_single(chunk.content)
chunk.metadata["strategy"] = "pure_vector"
if chunks:
vector_store.add_chunks(chunks, row["full_path"])
chunk_count += len(chunks)
setup_time = time.time() - start_time
print(f"Setup: {len(rows)} files, {chunk_count} chunks in {setup_time:.1f}s")
# Test queries
engine = HybridSearchEngine()
results = {}
print(f"\n{'Query':<45} {'Top Result':<30} {'Score':<8}")
print("-" * 70)
for query, expected_file in TEST_QUERIES:
search_results = engine.search(
db_path,
query,
limit=3,
enable_vector=True,
pure_vector=True,
)
top_file = search_results[0].path if search_results else "No results"
top_score = search_results[0].score if search_results else 0.0
found = expected_file in [r.path for r in search_results]
rank = None
if found:
for i, r in enumerate(search_results):
if r.path == expected_file:
rank = i + 1
break
status = "[OK]" if found and rank == 1 else ("[~]" if found else "[X]")
display_query = query[:42] + "..." if len(query) > 45 else query
display_file = top_file.split('/')[-1] if '/' in top_file else top_file
print(f"{status} {display_query:<43} {display_file:<30} {top_score:.3f}")
results[query] = {
"found": found,
"rank": rank,
"top_file": top_file,
"score": top_score,
}
return results
def test_llm_enhanced_search(db_path: Path, llm_tool: str = "gemini") -> Dict:
"""Test LLM-enhanced search (LLM summaries → fastembed)."""
print("\n" + "="*70)
print(f"LLM-ENHANCED SEARCH (Code → {llm_tool.upper()} → fastembed)")
print("="*70)
# Check CCW availability
llm_config = LLMConfig(enabled=True, tool=llm_tool, batch_size=2)
enhancer = LLMEnhancer(llm_config)
if not enhancer.check_available():
print("[X] CCW CLI not available - skipping LLM-enhanced test")
print(" Install CCW: npm install -g ccw")
return {}
start_time = time.time()
# Generate LLM-enhanced embeddings
embedder = Embedder(profile="code")
vector_store = VectorStore(db_path)
indexer = EnhancedSemanticIndexer(enhancer, embedder, vector_store)
# Prepare file data
file_data_list = [
FileData(path=path, content=content, language="python")
for path, content in TEST_DATASET.items()
]
# Index with LLM enhancement
print(f"Generating LLM summaries for {len(file_data_list)} files...")
indexed = indexer.index_files(file_data_list)
setup_time = time.time() - start_time
print(f"Setup: {indexed}/{len(file_data_list)} files indexed in {setup_time:.1f}s")
# Test queries
engine = HybridSearchEngine()
results = {}
print(f"\n{'Query':<45} {'Top Result':<30} {'Score':<8}")
print("-" * 70)
for query, expected_file in TEST_QUERIES:
search_results = engine.search(
db_path,
query,
limit=3,
enable_vector=True,
pure_vector=True,
)
top_file = search_results[0].path if search_results else "No results"
top_score = search_results[0].score if search_results else 0.0
found = expected_file in [r.path for r in search_results]
rank = None
if found:
for i, r in enumerate(search_results):
if r.path == expected_file:
rank = i + 1
break
status = "[OK]" if found and rank == 1 else ("[~]" if found else "[X]")
display_query = query[:42] + "..." if len(query) > 45 else query
display_file = top_file.split('/')[-1] if '/' in top_file else top_file
print(f"{status} {display_query:<43} {display_file:<30} {top_score:.3f}")
results[query] = {
"found": found,
"rank": rank,
"top_file": top_file,
"score": top_score,
}
return results
def compare_results(pure_results: Dict, llm_results: Dict) -> None:
"""Compare and analyze results from both approaches."""
print("\n" + "="*70)
print("COMPARISON SUMMARY")
print("="*70)
if not llm_results:
print("Cannot compare - LLM-enhanced test was skipped")
return
pure_score = 0
llm_score = 0
print(f"\n{'Query':<45} {'Pure':<10} {'LLM':<10}")
print("-" * 70)
for query, expected_file in TEST_QUERIES:
pure_res = pure_results.get(query, {})
llm_res = llm_results.get(query, {})
pure_status = f"[OK] Rank {pure_res.get('rank', '?')}" if pure_res.get('found') else "[X] Miss"
llm_status = f"[OK] Rank {llm_res.get('rank', '?')}" if llm_res.get('found') else "[X] Miss"
# Scoring: Rank 1 = 3 points, Rank 2 = 2 points, Rank 3 = 1 point
if pure_res.get('found') and pure_res.get('rank'):
pure_score += max(0, 4 - pure_res['rank'])
if llm_res.get('found') and llm_res.get('rank'):
llm_score += max(0, 4 - llm_res['rank'])
display_query = query[:42] + "..." if len(query) > 45 else query
print(f"{display_query:<45} {pure_status:<10} {llm_status:<10}")
print("-" * 70)
print(f"{'TOTAL SCORE':<45} {pure_score:<10} {llm_score:<10}")
print("="*70)
# Analysis
print("\nANALYSIS:")
if llm_score > pure_score:
improvement = ((llm_score - pure_score) / max(pure_score, 1)) * 100
print(f"[OK] LLM enhancement improves results by {improvement:.1f}%")
print(" Natural language summaries match queries better than raw code")
elif pure_score > llm_score:
degradation = ((pure_score - llm_score) / max(pure_score, 1)) * 100
print(f"[X] Pure vector performed {degradation:.1f}% better")
print(" LLM summaries may be too generic or missing key details")
else:
print("= Both approaches performed equally on this test set")
print("\nKEY FINDINGS:")
print("- Pure Vector: Direct code embeddings, fast but may miss semantic intent")
print("- LLM Enhanced: Natural language summaries, better for human-like queries")
print("- Best Use: Combine both - LLM for natural language, vector for code patterns")
def main():
parser = argparse.ArgumentParser(
description="Compare pure vector vs LLM-enhanced semantic search"
)
parser.add_argument(
"--tool",
choices=["gemini", "qwen"],
default="gemini",
help="LLM tool to use for enhancement (default: gemini)"
)
parser.add_argument(
"--skip-llm",
action="store_true",
help="Skip LLM-enhanced test (only run pure vector)"
)
args = parser.parse_args()
print("\n" + "="*70)
print("SEMANTIC SEARCH COMPARISON TEST")
print("Pure Vector vs LLM-Enhanced Vector Search")
print("="*70)
# Create test database
with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f:
db_path = Path(f.name)
try:
print(f"\nTest dataset: {len(TEST_DATASET)} Python files")
print(f"Test queries: {len(TEST_QUERIES)} natural language questions")
create_test_database(db_path)
# Test pure vector search
pure_results = test_pure_vector_search(db_path)
# Test LLM-enhanced search
if not args.skip_llm:
# Clear semantic_chunks table for LLM test
with sqlite3.connect(db_path) as conn:
conn.execute("DELETE FROM semantic_chunks")
conn.commit()
llm_results = test_llm_enhanced_search(db_path, args.tool)
else:
llm_results = {}
print("\n[X] LLM-enhanced test skipped (--skip-llm flag)")
# Compare results
compare_results(pure_results, llm_results)
finally:
# Cleanup - ensure all connections are closed
try:
import gc
gc.collect() # Force garbage collection to close any lingering connections
time.sleep(0.1) # Small delay for Windows to release file handle
if db_path.exists():
db_path.unlink()
except PermissionError:
print(f"\nWarning: Could not delete temporary database: {db_path}")
print("It will be cleaned up on next system restart.")
print("\n" + "="*70)
print("Test completed successfully!")
print("="*70)
if __name__ == "__main__":
main()

View File: inspect_llm_summaries.py

@@ -1,88 +0,0 @@
#!/usr/bin/env python3
"""Inspect LLM-generated summaries in semantic_chunks table."""
import sqlite3
import sys
from pathlib import Path
def inspect_summaries(db_path: Path):
"""Show LLM-generated summaries from database."""
if not db_path.exists():
print(f"Error: Database not found: {db_path}")
return
with sqlite3.connect(db_path) as conn:
conn.row_factory = sqlite3.Row
# Check if semantic_chunks table exists
cursor = conn.execute(
"SELECT name FROM sqlite_master WHERE type='table' AND name='semantic_chunks'"
)
if not cursor.fetchone():
print("No semantic_chunks table found")
return
# Get all chunks with metadata
cursor = conn.execute("""
SELECT file_path, chunk_index, content,
json_extract(metadata, '$.llm_summary') as summary,
json_extract(metadata, '$.llm_keywords') as keywords,
json_extract(metadata, '$.llm_purpose') as purpose,
json_extract(metadata, '$.strategy') as strategy
FROM semantic_chunks
ORDER BY file_path, chunk_index
""")
chunks = cursor.fetchall()
if not chunks:
print("No chunks found in database")
return
print("="*80)
print("LLM-GENERATED SUMMARIES INSPECTION")
print("="*80)
current_file = None
for chunk in chunks:
file_path = chunk['file_path']
if file_path != current_file:
print(f"\n{'='*80}")
print(f"FILE: {file_path}")
print(f"{'='*80}")
current_file = file_path
print(f"\n[Chunk {chunk['chunk_index']}]")
print(f"Strategy: {chunk['strategy']}")
if chunk['summary']:
print(f"\nLLM Summary:")
print(f" {chunk['summary']}")
if chunk['keywords']:
print(f"\nKeywords:")
print(f" {chunk['keywords']}")
if chunk['purpose']:
print(f"\nPurpose:")
print(f" {chunk['purpose']}")
# Show first 200 chars of content
content = chunk['content']
if len(content) > 200:
content = content[:200] + "..."
print(f"\nOriginal Content (first 200 chars):")
print(f" {content}")
print("-" * 80)
if __name__ == "__main__":
if len(sys.argv) < 2:
print("Usage: python inspect_llm_summaries.py <path_to_index.db>")
print("\nExample:")
print(" python inspect_llm_summaries.py ~/.codexlens/indexes/myproject/_index.db")
sys.exit(1)
db_path = Path(sys.argv[1])
inspect_summaries(db_path)

View File

@@ -1,112 +0,0 @@
#!/usr/bin/env python3
"""Directly show LLM analysis of test code."""
import tempfile
from pathlib import Path

from codexlens.semantic.llm_enhancer import LLMEnhancer, LLMConfig, FileData
# Misleading code example
TEST_CODE = '''"""Email sending service."""
import psycopg2
from psycopg2 import pool
from contextlib import contextmanager
class EmailSender:
"""SMTP email sender with retry logic."""
def __init__(self, min_conn: int = 1, max_conn: int = 10):
"""Initialize email sender."""
self.pool = psycopg2.pool.SimpleConnectionPool(
min_conn, max_conn,
user='dbuser', host='localhost', database='myapp'
)
@contextmanager
def send_email(self):
"""Send email message."""
conn = self.pool.getconn()
try:
yield conn
conn.commit()
finally:
self.pool.putconn(conn)
'''
print("="*80)
print("LLM ANALYSIS OF MISLEADING CODE")
print("="*80)
print("\n[Original Code with Misleading Comments]")
print("-"*80)
print(TEST_CODE)
print("-"*80)
print("\n[Actual Functionality]")
print(" - Imports: psycopg2 (PostgreSQL library)")
print(" - Class: EmailSender (but name is misleading!)")
print(" - Actually: Creates PostgreSQL connection pool")
print(" - Methods: send_email (actually gets DB connection)")
print("\n[Misleading Documentation]")
print(" - Module docstring: 'Email sending service' (WRONG)")
print(" - Class docstring: 'SMTP email sender' (WRONG)")
print(" - Method docstring: 'Send email message' (WRONG)")
print("\n" + "="*80)
print("TESTING LLM UNDERSTANDING")
print("="*80)
# Test LLM analysis
config = LLMConfig(enabled=True, tool="gemini", batch_size=1)
enhancer = LLMEnhancer(config)
if not enhancer.check_available():
print("\n[X] CCW CLI not available")
print("Install: npm install -g ccw")
    raise SystemExit(1)
print("\n[Calling Gemini to analyze code...]")
file_data = FileData(path="db/pool.py", content=TEST_CODE, language="python")
with tempfile.TemporaryDirectory() as tmpdir:
result = enhancer.enhance_files([file_data], Path(tmpdir))
if "db/pool.py" in result:
metadata = result["db/pool.py"]
print("\n[LLM-Generated Summary]")
print("-"*80)
print(f"Summary: {metadata.summary}")
print(f"\nPurpose: {metadata.purpose}")
print(f"\nKeywords: {', '.join(metadata.keywords)}")
print("-"*80)
print("\n[Analysis]")
# Check if LLM identified the real functionality
summary_lower = metadata.summary.lower()
keywords_lower = [k.lower() for k in metadata.keywords]
correct_terms = ['database', 'postgresql', 'connection', 'pool', 'psycopg']
misleading_terms = ['email', 'smtp', 'send']
found_correct = sum(1 for term in correct_terms
if term in summary_lower or any(term in k for k in keywords_lower))
found_misleading = sum(1 for term in misleading_terms
if term in summary_lower or any(term in k for k in keywords_lower))
print(f"Correct terms found: {found_correct}/{len(correct_terms)}")
print(f"Misleading terms found: {found_misleading}/{len(misleading_terms)}")
if found_correct > found_misleading:
print("\n[OK] LLM correctly identified actual functionality!")
print(" LLM ignored misleading comments and analyzed code behavior")
elif found_misleading > found_correct:
print("\n[X] LLM was misled by incorrect comments")
print(" LLM trusted documentation over code analysis")
else:
print("\n[~] Mixed results - LLM found both correct and misleading terms")
else:
print("\n[X] LLM analysis failed - no results returned")
print("\n" + "="*80)

View File: test_misleading_comments.py

@@ -1,491 +0,0 @@
#!/usr/bin/env python3
"""Test pure vector vs LLM-enhanced search with misleading/missing comments.
This test demonstrates how LLM enhancement can overcome:
1. Missing comments/docstrings
2. Misleading or incorrect comments
3. Outdated documentation
Usage:
python test_misleading_comments.py --tool gemini
"""
import argparse
import sqlite3
import sys
import tempfile
import time
from pathlib import Path
from typing import Dict, List
# Check dependencies
try:
from codexlens.semantic import SEMANTIC_AVAILABLE
from codexlens.semantic.embedder import Embedder
from codexlens.semantic.vector_store import VectorStore
from codexlens.semantic.chunker import Chunker, ChunkConfig
from codexlens.semantic.llm_enhancer import (
LLMEnhancer,
LLMConfig,
FileData,
EnhancedSemanticIndexer,
)
from codexlens.storage.dir_index import DirIndexStore
from codexlens.search.hybrid_search import HybridSearchEngine
except ImportError as e:
print(f"Error: Missing dependencies - {e}")
print("Install with: pip install codexlens[semantic]")
sys.exit(1)
if not SEMANTIC_AVAILABLE:
print("Error: Semantic search dependencies not available")
sys.exit(1)
# Test dataset with MISLEADING or MISSING comments
MISLEADING_DATASET = {
"crypto/hasher.py": '''"""Simple string utilities."""
import bcrypt
def process_string(s: str, rounds: int = 12) -> str:
"""Convert string to uppercase."""
salt = bcrypt.gensalt(rounds=rounds)
hashed = bcrypt.hashpw(s.encode('utf-8'), salt)
return hashed.decode('utf-8')
def check_string(s: str, target: str) -> bool:
"""Check if two strings are equal."""
return bcrypt.checkpw(s.encode('utf-8'), target.encode('utf-8'))
''',
"auth/token.py": '''import jwt
from datetime import datetime, timedelta
SECRET_KEY = "key123"
def make_thing(uid: int, exp: int = 3600) -> str:
payload = {
'user_id': uid,
'exp': datetime.utcnow() + timedelta(seconds=exp),
'iat': datetime.utcnow()
}
return jwt.encode(payload, SECRET_KEY, algorithm='HS256')
def parse_thing(thing: str) -> dict:
try:
return jwt.decode(thing, SECRET_KEY, algorithms=['HS256'])
except jwt.ExpiredSignatureError:
return None
''',
"api/handlers.py": '''"""Database connection utilities."""
from flask import Flask, request, jsonify
app = Flask(__name__)
@app.route('/api/items', methods=['POST'])
def create_item():
"""Delete an existing item."""
data = request.get_json()
if not data.get('email') or not data.get('password'):
return jsonify({'error': 'Missing data'}), 400
item_id = 12345
return jsonify({'item_id': item_id, 'success': True}), 201
@app.route('/api/items/<int:item_id>', methods=['GET'])
def get_item(item_id: int):
"""Update item configuration."""
item = {
'id': item_id,
'email': 'user@example.com',
'name': 'John Doe'
}
return jsonify(item), 200
''',
"utils/checker.py": '''"""Math calculation functions."""
import re
def calc_sum(email: str) -> bool:
pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$'
return bool(re.match(pattern, email))
def format_text(text: str, max_len: int = 255) -> str:
text = re.sub(r'[<>"\\'&]', '', text)
return text.strip()[:max_len]
''',
"db/pool.py": '''"""Email sending service."""
import psycopg2
from psycopg2 import pool
from contextlib import contextmanager
class EmailSender:
"""SMTP email sender with retry logic."""
def __init__(self, min_conn: int = 1, max_conn: int = 10):
"""Initialize email sender."""
self.pool = psycopg2.pool.SimpleConnectionPool(
min_conn, max_conn,
user='dbuser', host='localhost', database='myapp'
)
@contextmanager
def send_email(self):
"""Send email message."""
conn = self.pool.getconn()
try:
yield conn
conn.commit()
finally:
self.pool.putconn(conn)
''',
}
# Test queries - natural language based on ACTUAL functionality (not misleading comments)
TEST_QUERIES = [
("How to hash passwords securely with bcrypt?", "crypto/hasher.py"),
("Generate JWT authentication token", "auth/token.py"),
("Create user account REST API endpoint", "api/handlers.py"),
("Validate email address format", "utils/checker.py"),
("PostgreSQL database connection pool", "db/pool.py"),
]
def create_test_database(db_path: Path) -> None:
"""Create and populate test database."""
store = DirIndexStore(db_path)
store.initialize()
with store._get_connection() as conn:
for path, content in MISLEADING_DATASET.items():
name = path.split('/')[-1]
conn.execute(
"""INSERT INTO files (name, full_path, content, language, mtime)
VALUES (?, ?, ?, ?, ?)""",
(name, path, content, "python", 0.0)
)
conn.commit()
store.close()
def test_pure_vector_search(db_path: Path) -> Dict:
"""Test pure vector search (relies on code + misleading comments)."""
print("\n" + "="*70)
print("PURE VECTOR SEARCH (Code + Misleading Comments -> fastembed)")
print("="*70)
start_time = time.time()
# Generate pure vector embeddings
embedder = Embedder(profile="code")
vector_store = VectorStore(db_path)
chunker = Chunker(config=ChunkConfig(max_chunk_size=2000))
with sqlite3.connect(db_path) as conn:
conn.row_factory = sqlite3.Row
rows = conn.execute("SELECT full_path, content FROM files").fetchall()
chunk_count = 0
for row in rows:
chunks = chunker.chunk_sliding_window(
row["content"],
file_path=row["full_path"],
language="python"
)
for chunk in chunks:
chunk.embedding = embedder.embed_single(chunk.content)
chunk.metadata["strategy"] = "pure_vector"
if chunks:
vector_store.add_chunks(chunks, row["full_path"])
chunk_count += len(chunks)
setup_time = time.time() - start_time
print(f"Setup: {len(rows)} files, {chunk_count} chunks in {setup_time:.1f}s")
print("Note: Embeddings include misleading comments")
# Test queries
engine = HybridSearchEngine()
results = {}
print(f"\n{'Query':<45} {'Top Result':<30} {'Score':<8}")
print("-" * 70)
for query, expected_file in TEST_QUERIES:
search_results = engine.search(
db_path,
query,
limit=3,
enable_vector=True,
pure_vector=True,
)
top_file = search_results[0].path if search_results else "No results"
top_score = search_results[0].score if search_results else 0.0
found = expected_file in [r.path for r in search_results]
rank = None
if found:
for i, r in enumerate(search_results):
if r.path == expected_file:
rank = i + 1
break
status = "[OK]" if found and rank == 1 else ("[~]" if found else "[X]")
display_query = query[:42] + "..." if len(query) > 45 else query
display_file = top_file.split('/')[-1] if '/' in top_file else top_file
print(f"{status} {display_query:<43} {display_file:<30} {top_score:.3f}")
results[query] = {
"found": found,
"rank": rank,
"top_file": top_file,
"score": top_score,
}
return results
def test_llm_enhanced_search(db_path: Path, llm_tool: str = "gemini") -> Dict:
"""Test LLM-enhanced search (LLM reads code and generates accurate summary)."""
print("\n" + "="*70)
print(f"LLM-ENHANCED SEARCH (Code -> {llm_tool.upper()} Analysis -> fastembed)")
print("="*70)
# Check CCW availability
llm_config = LLMConfig(enabled=True, tool=llm_tool, batch_size=2)
enhancer = LLMEnhancer(llm_config)
if not enhancer.check_available():
print("[X] CCW CLI not available - skipping LLM-enhanced test")
print(" Install CCW: npm install -g ccw")
return {}
start_time = time.time()
# Generate LLM-enhanced embeddings
embedder = Embedder(profile="code")
vector_store = VectorStore(db_path)
indexer = EnhancedSemanticIndexer(enhancer, embedder, vector_store)
# Prepare file data
file_data_list = [
FileData(path=path, content=content, language="python")
for path, content in MISLEADING_DATASET.items()
]
# Index with LLM enhancement
print(f"LLM analyzing code (ignoring misleading comments)...")
indexed = indexer.index_files(file_data_list)
setup_time = time.time() - start_time
print(f"Setup: {indexed}/{len(file_data_list)} files indexed in {setup_time:.1f}s")
print("Note: LLM generates summaries based on actual code logic")
# Test queries
engine = HybridSearchEngine()
results = {}
print(f"\n{'Query':<45} {'Top Result':<30} {'Score':<8}")
print("-" * 70)
for query, expected_file in TEST_QUERIES:
search_results = engine.search(
db_path,
query,
limit=3,
enable_vector=True,
pure_vector=True,
)
top_file = search_results[0].path if search_results else "No results"
top_score = search_results[0].score if search_results else 0.0
found = expected_file in [r.path for r in search_results]
rank = None
if found:
for i, r in enumerate(search_results):
if r.path == expected_file:
rank = i + 1
break
status = "[OK]" if found and rank == 1 else ("[~]" if found else "[X]")
display_query = query[:42] + "..." if len(query) > 45 else query
display_file = top_file.split('/')[-1] if '/' in top_file else top_file
print(f"{status} {display_query:<43} {display_file:<30} {top_score:.3f}")
results[query] = {
"found": found,
"rank": rank,
"top_file": top_file,
"score": top_score,
}
return results
def compare_results(pure_results: Dict, llm_results: Dict) -> None:
"""Compare and analyze results from both approaches."""
print("\n" + "="*70)
print("COMPARISON SUMMARY - MISLEADING COMMENTS TEST")
print("="*70)
if not llm_results:
print("Cannot compare - LLM-enhanced test was skipped")
return
pure_score = 0
llm_score = 0
print(f"\n{'Query':<45} {'Pure':<10} {'LLM':<10}")
print("-" * 70)
for query, expected_file in TEST_QUERIES:
pure_res = pure_results.get(query, {})
llm_res = llm_results.get(query, {})
pure_status = f"[OK] Rank {pure_res.get('rank', '?')}" if pure_res.get('found') else "[X] Miss"
llm_status = f"[OK] Rank {llm_res.get('rank', '?')}" if llm_res.get('found') else "[X] Miss"
# Scoring: Rank 1 = 3 points, Rank 2 = 2 points, Rank 3 = 1 point
if pure_res.get('found') and pure_res.get('rank'):
pure_score += max(0, 4 - pure_res['rank'])
if llm_res.get('found') and llm_res.get('rank'):
llm_score += max(0, 4 - llm_res['rank'])
display_query = query[:42] + "..." if len(query) > 45 else query
print(f"{display_query:<45} {pure_status:<10} {llm_status:<10}")
print("-" * 70)
print(f"{'TOTAL SCORE':<45} {pure_score:<10} {llm_score:<10}")
print("="*70)
# Analysis
print("\nANALYSIS:")
if llm_score > pure_score:
improvement = ((llm_score - pure_score) / max(pure_score, 1)) * 100
print(f"[OK] LLM enhancement improves results by {improvement:.1f}%")
print(" LLM understands actual code logic despite misleading comments")
print(" Pure vector search misled by incorrect documentation")
elif pure_score > llm_score:
degradation = ((pure_score - llm_score) / max(pure_score, 1)) * 100
print(f"[X] Pure vector performed {degradation:.1f}% better")
print(" Unexpected: Pure vector wasn't affected by misleading comments")
else:
print("= Both approaches performed equally")
print(" Test dataset may still be too simple to show differences")
print("\nKEY INSIGHTS:")
print("- Pure Vector: Embeds code + comments together, can be misled")
print("- LLM Enhanced: Analyzes actual code behavior, ignores bad comments")
print("- Best Use: LLM enhancement crucial for poorly documented codebases")
print("\nMISLEADING COMMENTS IN TEST:")
print("1. 'hasher.py' claims 'string utilities' but does bcrypt hashing")
print("2. 'token.py' has no docstrings, unclear function names")
print("3. 'handlers.py' says 'database utilities' but is REST API")
print("4. 'handlers.py' docstrings opposite (create says delete, etc)")
print("5. 'checker.py' claims 'math functions' but validates emails")
print("6. 'pool.py' claims 'email sender' but is database pool")
def main():
parser = argparse.ArgumentParser(
description="Test pure vector vs LLM-enhanced with misleading comments"
)
parser.add_argument(
"--tool",
choices=["gemini", "qwen"],
default="gemini",
help="LLM tool to use (default: gemini)"
)
parser.add_argument(
"--skip-llm",
action="store_true",
help="Skip LLM-enhanced test"
)
parser.add_argument(
"--keep-db",
type=str,
help="Save database to specified path for inspection (e.g., ./test_results.db)"
)
args = parser.parse_args()
print("\n" + "="*70)
print("MISLEADING COMMENTS TEST")
print("Pure Vector vs LLM-Enhanced with Incorrect Documentation")
print("="*70)
# Create test database
with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f:
db_path = Path(f.name)
try:
print(f"\nTest dataset: {len(MISLEADING_DATASET)} Python files")
print(f"Test queries: {len(TEST_QUERIES)} natural language questions")
print("\nChallenges:")
print("- Misleading module docstrings")
print("- Incorrect function docstrings")
print("- Missing documentation")
print("- Unclear function names")
create_test_database(db_path)
# Test pure vector search
pure_results = test_pure_vector_search(db_path)
# Test LLM-enhanced search
if not args.skip_llm:
# Clear semantic_chunks table for LLM test
with sqlite3.connect(db_path) as conn:
conn.execute("DELETE FROM semantic_chunks")
conn.commit()
llm_results = test_llm_enhanced_search(db_path, args.tool)
else:
llm_results = {}
print("\n[X] LLM-enhanced test skipped (--skip-llm flag)")
# Compare results
compare_results(pure_results, llm_results)
finally:
# Save or cleanup database
if args.keep_db:
import shutil
save_path = Path(args.keep_db)
try:
import gc
gc.collect()
time.sleep(0.2)
shutil.copy2(db_path, save_path)
print(f"\n[OK] Database saved to: {save_path}")
print(f"Inspect with: python scripts/inspect_llm_summaries.py {save_path}")
except Exception as e:
print(f"\n[X] Failed to save database: {e}")
finally:
try:
if db_path.exists():
db_path.unlink()
                except OSError:
                    pass  # best-effort cleanup; Windows may still hold the file handle
else:
# Cleanup
try:
import gc
gc.collect()
time.sleep(0.1)
if db_path.exists():
db_path.unlink()
except PermissionError:
print(f"\nWarning: Could not delete temporary database: {db_path}")
print("\n" + "="*70)
print("Test completed!")
print("="*70)
if __name__ == "__main__":
main()