"""Comprehensive comparison test for vector search vs hybrid search.

This test diagnoses why vector search returns empty results and compares
performance between different search modes.
"""

import json
import sqlite3
import tempfile
import time
from contextlib import closing
from pathlib import Path
from typing import Dict, List, Any

import pytest

from codexlens.entities import SearchResult
from codexlens.search.hybrid_search import HybridSearchEngine
from codexlens.storage.dir_index import DirIndexStore

# Check semantic search availability
try:
    from codexlens.semantic.embedder import Embedder
    from codexlens.semantic.vector_store import VectorStore
    from codexlens.semantic import SEMANTIC_AVAILABLE
    SEMANTIC_DEPS_AVAILABLE = SEMANTIC_AVAILABLE
except ImportError:
    SEMANTIC_DEPS_AVAILABLE = False


class TestSearchComparison:
    """Comprehensive comparison of search modes."""

    @pytest.fixture
    def sample_project_db(self):
        """Create sample project database with semantic chunks.

        Builds an ``_index.db`` inside a temporary directory, initializes it
        through ``DirIndexStore`` and inserts five small Python sample files
        into the ``files`` table.

        Yields:
            Path to the populated index database. The store is closed after
            the test finishes, before the temporary directory is removed.
        """
        with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as tmpdir:
            db_path = Path(tmpdir) / "_index.db"
            store = DirIndexStore(db_path)
            store.initialize()

            # Sample files with varied content for testing
            sample_files = {
                "src/auth/authentication.py": """
def authenticate_user(username: str, password: str) -> bool:
    '''Authenticate user with credentials using bcrypt hashing.

    This function validates user credentials against the database
    and returns True if authentication succeeds.
    '''
    hashed = hash_password(password)
    return verify_credentials(username, hashed)

def hash_password(password: str) -> str:
    '''Hash password using bcrypt algorithm.'''
    import bcrypt
    return bcrypt.hashpw(password.encode(), bcrypt.gensalt()).decode()

def verify_credentials(user: str, pwd_hash: str) -> bool:
    '''Verify user credentials against database.'''
    # Database verification logic
    return True
""",
                "src/auth/authorization.py": """
def authorize_action(user_id: int, resource: str, action: str) -> bool:
    '''Authorize user action on resource using role-based access control.

    Checks if user has permission to perform action on resource
    based on their assigned roles.
    '''
    roles = get_user_roles(user_id)
    permissions = get_role_permissions(roles)
    return has_permission(permissions, resource, action)

def get_user_roles(user_id: int) -> List[str]:
    '''Fetch user roles from database.'''
    return ["user", "admin"]

def has_permission(permissions, resource, action) -> bool:
    '''Check if permissions allow action on resource.'''
    return True
""",
                "src/models/user.py": """
from dataclasses import dataclass
from typing import Optional

@dataclass
class User:
    '''User model representing application users.

    Stores user profile information and authentication state.
    '''
    id: int
    username: str
    email: str
    password_hash: str
    is_active: bool = True

    def authenticate(self, password: str) -> bool:
        '''Authenticate this user with password.'''
        from auth.authentication import verify_credentials
        return verify_credentials(self.username, password)

    def has_role(self, role: str) -> bool:
        '''Check if user has specific role.'''
        return True
""",
                "src/api/user_api.py": """
from flask import Flask, request, jsonify
from models.user import User

app = Flask(__name__)

@app.route('/api/user/', methods=['GET'])
def get_user(user_id: int):
    '''Get user by ID from database.

    Returns user profile information as JSON.
    '''
    user = User.query.get(user_id)
    return jsonify(user.to_dict())

@app.route('/api/user/login', methods=['POST'])
def login():
    '''User login endpoint using username and password.

    Authenticates user and returns session token.
    '''
    data = request.json
    username = data.get('username')
    password = data.get('password')

    if authenticate_user(username, password):
        token = generate_session_token(username)
        return jsonify({'token': token})
    return jsonify({'error': 'Invalid credentials'}), 401
""",
                "tests/test_auth.py": """
import pytest
from auth.authentication import authenticate_user, hash_password

class TestAuthentication:
    '''Test authentication functionality.'''

    def test_authenticate_valid_user(self):
        '''Test authentication with valid credentials.'''
        assert authenticate_user("testuser", "password123") == True

    def test_authenticate_invalid_user(self):
        '''Test authentication with invalid credentials.'''
        assert authenticate_user("invalid", "wrong") == False

    def test_password_hashing(self):
        '''Test password hashing produces unique hashes.'''
        hash1 = hash_password("password")
        hash2 = hash_password("password")
        assert hash1 != hash2  # Salts should differ
""",
            }

            # Insert files into database
            with store._get_connection() as conn:
                for file_path, content in sample_files.items():
                    name = file_path.split('/')[-1]
                    lang = "python"
                    conn.execute(
                        """INSERT INTO files (name, full_path, content, language, mtime)
                           VALUES (?, ?, ?, ?, ?)""",
                        # mtime is a wall-clock timestamp, so time.time() is correct here.
                        (name, file_path, content, lang, time.time())
                    )
                conn.commit()

            yield db_path
            store.close()

    def _check_semantic_chunks_table(self, db_path: Path) -> Dict[str, Any]:
        """Check if semantic_chunks table exists and has data.

        Args:
            db_path: Path to the SQLite index database.

        Returns:
            Dict with ``table_exists`` (bool) and ``chunk_count`` (int; 0 when
            the table is missing).
        """
        # sqlite3's own context manager only manages the transaction; it does
        # not close the connection. closing() releases the file handle so the
        # temporary database can be deleted afterwards.
        with closing(sqlite3.connect(db_path)) as conn:
            table_exists = conn.execute(
                "SELECT name FROM sqlite_master WHERE type='table' AND name='semantic_chunks'"
            ).fetchone() is not None

            chunk_count = 0
            if table_exists:
                chunk_count = conn.execute(
                    "SELECT COUNT(*) FROM semantic_chunks"
                ).fetchone()[0]

        return {
            "table_exists": table_exists,
            "chunk_count": chunk_count,
        }

    def _create_vector_index(self, db_path: Path) -> Dict[str, Any]:
        """Create vector embeddings for indexed files.

        Reads every row of the ``files`` table, chunks the content with a
        sliding window, embeds each chunk and stores the chunks through
        ``VectorStore``.

        Args:
            db_path: Path to the SQLite index database.

        Returns:
            On success: ``success=True``, ``chunks_created`` and
            ``files_processed``. On failure (missing semantic dependencies or
            any exception): ``success=False``, ``error`` and
            ``chunks_created=0``. Exceptions are reported, never raised, so
            the calling test can print a diagnosis.
        """
        if not SEMANTIC_DEPS_AVAILABLE:
            return {
                "success": False,
                "error": "Semantic dependencies not available",
                "chunks_created": 0,
            }

        try:
            from codexlens.semantic.chunker import Chunker, ChunkConfig

            # Initialize embedder and vector store
            embedder = Embedder(profile="code")
            vector_store = VectorStore(db_path)
            chunker = Chunker(config=ChunkConfig(max_chunk_size=2000))

            # Read files from database. Rows are materialised as plain tuples
            # inside the block so nothing depends on the connection once it
            # has been closed.
            with closing(sqlite3.connect(db_path)) as conn:
                conn.row_factory = sqlite3.Row
                files = [
                    (row["full_path"], row["content"])
                    for row in conn.execute("SELECT full_path, content FROM files")
                ]

            chunks_created = 0
            for file_path, content in files:
                # Create semantic chunks using sliding window
                chunks = chunker.chunk_sliding_window(
                    content,
                    file_path=file_path,
                    language="python"
                )

                # Generate embeddings
                for chunk in chunks:
                    chunk.embedding = embedder.embed_single(chunk.content)

                # Store chunks
                if chunks:  # Only store if we have chunks
                    vector_store.add_chunks(chunks, file_path)
                    chunks_created += len(chunks)

            return {
                "success": True,
                "chunks_created": chunks_created,
                "files_processed": len(files),
            }
        except Exception as exc:
            return {
                "success": False,
                "error": str(exc),
                "chunks_created": 0,
            }

    def _run_search_mode(
        self,
        db_path: Path,
        query: str,
        mode: str,
        limit: int = 10,
    ) -> Dict[str, Any]:
        """Run search in specified mode and collect metrics.

        Args:
            db_path: Path to the SQLite index database.
            query: Search query text.
            mode: One of ``"exact"``, ``"fuzzy"``, ``"vector"``, ``"hybrid"``.
            limit: Maximum number of results requested from the engine.

        Returns:
            Dict with ``success``, ``mode``, ``query``, ``result_count``,
            ``elapsed_ms`` and ``results`` (up to five summarised hits; empty
            on failure). Failures additionally carry ``error``.

        Raises:
            ValueError: If ``mode`` is not one of the supported modes.
        """
        engine = HybridSearchEngine()

        # Map mode to (enable_fuzzy, enable_vector, pure_vector)
        mode_flags = {
            "exact": (False, False, False),
            "fuzzy": (True, False, False),
            "vector": (False, True, True),  # Use pure vector mode for vector-only search
            "hybrid": (True, True, False),
        }
        if mode not in mode_flags:
            raise ValueError(f"Invalid mode: {mode}")
        enable_fuzzy, enable_vector, pure_vector = mode_flags[mode]

        # Measure search time with a monotonic, high-resolution clock;
        # time.time() can jump and is too coarse for millisecond timings.
        start_time = time.perf_counter()
        try:
            results = engine.search(
                db_path,
                query,
                limit=limit,
                enable_fuzzy=enable_fuzzy,
                enable_vector=enable_vector,
                pure_vector=pure_vector,
            )
            elapsed_ms = (time.perf_counter() - start_time) * 1000

            return {
                "success": True,
                "mode": mode,
                "query": query,
                "result_count": len(results),
                "elapsed_ms": elapsed_ms,
                "results": [
                    {
                        "path": r.path,
                        "score": r.score,
                        "excerpt": r.excerpt[:100] if r.excerpt else "",
                        "source": getattr(r, "search_source", None),
                    }
                    for r in results[:5]  # Top 5 results
                ],
            }
        except Exception as exc:
            elapsed_ms = (time.perf_counter() - start_time) * 1000
            return {
                "success": False,
                "mode": mode,
                "query": query,
                "error": str(exc),
                "elapsed_ms": elapsed_ms,
                "result_count": 0,
                # Same shape as the success case so callers can always index it.
                "results": [],
            }

    @pytest.mark.skipif(not SEMANTIC_DEPS_AVAILABLE, reason="Semantic dependencies not available")
    def test_full_search_comparison_with_vectors(self, sample_project_db):
        """Complete search comparison test with vector embeddings.

        Builds the vector index, runs three queries in all four modes, prints
        a per-mode summary table, and asserts that chunks were stored when
        index creation succeeded. An empty vector result set only produces a
        printed warning, not a failure.
        """
        db_path = sample_project_db

        # Step 1: Check initial state
        print("\n=== Step 1: Checking initial database state ===")
        initial_state = self._check_semantic_chunks_table(db_path)
        print(f"Table exists: {initial_state['table_exists']}")
        print(f"Chunk count: {initial_state['chunk_count']}")

        # Step 2: Create vector index
        print("\n=== Step 2: Creating vector embeddings ===")
        vector_result = self._create_vector_index(db_path)
        print(f"Success: {vector_result['success']}")
        if vector_result['success']:
            print(f"Chunks created: {vector_result['chunks_created']}")
            print(f"Files processed: {vector_result['files_processed']}")
        else:
            print(f"Error: {vector_result.get('error', 'Unknown')}")

        # Step 3: Verify vector index was created
        print("\n=== Step 3: Verifying vector index ===")
        final_state = self._check_semantic_chunks_table(db_path)
        print(f"Table exists: {final_state['table_exists']}")
        print(f"Chunk count: {final_state['chunk_count']}")

        # Step 4: Run comparison tests
        print("\n=== Step 4: Running search mode comparison ===")
        test_queries = [
            "authenticate user credentials",  # Semantic query
            "authentication",  # Keyword query
            "password hashing bcrypt",  # Multi-term query
        ]

        comparison_results = []
        for query in test_queries:
            print(f"\n--- Query: '{query}' ---")
            for mode in ["exact", "fuzzy", "vector", "hybrid"]:
                result = self._run_search_mode(db_path, query, mode, limit=10)
                comparison_results.append(result)

                print(f"\n{mode.upper()} mode:")
                print(f" Success: {result['success']}")
                print(f" Results: {result['result_count']}")
                print(f" Time: {result['elapsed_ms']:.2f}ms")
                if result['success'] and result['result_count'] > 0:
                    print(f" Top result: {result['results'][0]['path']}")
                    print(f" Score: {result['results'][0]['score']:.3f}")
                    print(f" Source: {result['results'][0]['source']}")
                elif not result['success']:
                    print(f" Error: {result.get('error', 'Unknown')}")

        # Step 5: Generate comparison report
        print("\n=== Step 5: Comparison Summary ===")

        # Group by mode
        mode_stats: Dict[str, Dict[str, Any]] = {}
        for result in comparison_results:
            stats = mode_stats.setdefault(
                result['mode'],
                {
                    "total_searches": 0,
                    "successful_searches": 0,
                    "total_results": 0,
                    "total_time_ms": 0,
                    "empty_results": 0,
                },
            )
            stats["total_searches"] += 1
            if result['success']:
                stats["successful_searches"] += 1
                stats["total_results"] += result['result_count']
                if result['result_count'] == 0:
                    stats["empty_results"] += 1
            stats["total_time_ms"] += result['elapsed_ms']

        # Print summary table
        print("\nMode | Queries | Success | Avg Results | Avg Time | Empty Results")
        print("-" * 75)
        for mode in ["exact", "fuzzy", "vector", "hybrid"]:
            if mode in mode_stats:
                stats = mode_stats[mode]
                avg_results = stats["total_results"] / stats["total_searches"]
                avg_time = stats["total_time_ms"] / stats["total_searches"]
                print(
                    f"{mode:9} | {stats['total_searches']:7} | "
                    f"{stats['successful_searches']:7} | {avg_results:11.1f} | "
                    f"{avg_time:8.1f}ms | {stats['empty_results']:13}"
                )

        # Assertions
        assert initial_state is not None
        if vector_result['success']:
            assert final_state['chunk_count'] > 0, "Vector index should contain chunks"

            # Find vector search results
            vector_results = [r for r in comparison_results if r['mode'] == 'vector']
            if vector_results:
                # At least one vector search should return results if index was created
                has_vector_results = any(r.get('result_count', 0) > 0 for r in vector_results)
                if not has_vector_results:
                    print("\n⚠️ WARNING: Vector index created but vector search returned no results!")
                    print("This indicates a potential issue with vector search implementation.")

    def test_search_comparison_without_vectors(self, sample_project_db):
        """Search comparison test without vector embeddings (baseline).

        Runs exact and fuzzy searches for reporting, then asserts that a
        vector-only search yields no results when no embeddings were created.
        """
        db_path = sample_project_db

        print("\n=== Testing search without vector embeddings ===")

        # Check state
        state = self._check_semantic_chunks_table(db_path)
        print(f"Semantic chunks table exists: {state['table_exists']}")
        print(f"Chunk count: {state['chunk_count']}")

        # Run exact and fuzzy searches only
        test_queries = ["authentication", "user password", "bcrypt hash"]

        for query in test_queries:
            print(f"\n--- Query: '{query}' ---")
            for mode in ["exact", "fuzzy"]:
                result = self._run_search_mode(db_path, query, mode, limit=10)

                print(f"{mode.upper()}: {result['result_count']} results in {result['elapsed_ms']:.2f}ms")
                if result['success'] and result['result_count'] > 0:
                    print(f" Top: {result['results'][0]['path']} (score: {result['results'][0]['score']:.3f})")

        # Test vector search without embeddings (should return empty)
        print("\n--- Testing vector search without embeddings ---")
        vector_result = self._run_search_mode(db_path, "authentication", "vector", limit=10)
        print(f"Vector search result count: {vector_result['result_count']}")
        print(f"This is expected to be 0 without embeddings: {vector_result['result_count'] == 0}")

        assert vector_result['result_count'] == 0, \
            "Vector search should return empty results when no embeddings exist"


class TestDiagnostics:
    """Diagnostic tests to identify specific issues."""

    @pytest.fixture
    def empty_db(self):
        """Create empty database.

        Yields:
            Path to an initialized index database with no files in it. The
            file is removed afterwards on a best-effort basis.
        """
        # Only reserve a unique file name; the handle is closed before the
        # store opens the file.
        with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f:
            db_path = Path(f.name)

        store = DirIndexStore(db_path)
        store.initialize()
        store.close()

        yield db_path

        if db_path.exists():
            for attempt in range(5):
                try:
                    db_path.unlink()
                    break
                except PermissionError:
                    time.sleep(0.05 * (attempt + 1))
            else:
                # Best-effort cleanup (Windows SQLite locks can linger briefly).
                try:
                    db_path.unlink(missing_ok=True)
                except (PermissionError, OSError):
                    pass

    def test_diagnose_empty_database(self, empty_db):
        """Diagnose behavior with empty database.

        Every search mode must return an empty list, without raising, when
        the index contains no files.
        """
        engine = HybridSearchEngine()

        print("\n=== Diagnosing empty database ===")

        # Test all modes
        for mode_config in [
            ("exact", False, False),
            ("fuzzy", True, False),
            ("vector", False, True),
            ("hybrid", True, True),
        ]:
            mode, enable_fuzzy, enable_vector = mode_config

            # Only the search call is guarded: wrapping the assertions too
            # would turn an AssertionError into a misleading
            # "raised exception" failure.
            try:
                results = engine.search(
                    empty_db,
                    "test",
                    limit=10,
                    enable_fuzzy=enable_fuzzy,
                    enable_vector=enable_vector,
                )
            except Exception as exc:
                print(f"{mode}: ERROR - {exc}")
                # Should not raise errors, should return empty list
                pytest.fail(f"Search mode '{mode}' raised exception on empty database: {exc}")

            assert isinstance(results, list), (
                f"Search mode '{mode}' returned {type(results).__name__}, expected list"
            )
            print(f"{mode}: {len(results)} results (OK)")
            assert len(results) == 0, (
                f"Search mode '{mode}' returned {len(results)} results on empty database"
            )

    @pytest.mark.skipif(not SEMANTIC_DEPS_AVAILABLE, reason="Semantic dependencies not available")
    def test_diagnose_embedder_initialization(self):
        """Test embedder initialization and embedding generation.

        Checks that a single embedding has ``embedder.embedding_dim`` entries
        and that every entry is a ``float``. Any error is printed and then
        re-raised so the test fails with the original traceback.
        """
        print("\n=== Diagnosing embedder ===")

        try:
            embedder = Embedder(profile="code")
            print(f"✓ Embedder initialized (model: {embedder.model_name})")
            print(f" Embedding dimension: {embedder.embedding_dim}")

            # Test embedding generation
            test_text = "def authenticate_user(username, password):"
            embedding = embedder.embed_single(test_text)

            print(f"✓ Generated embedding (length: {len(embedding)})")
            print(f" Sample values: {embedding[:5]}")

            assert len(embedding) == embedder.embedding_dim
            assert all(isinstance(v, float) for v in embedding)

        except Exception as exc:
            print(f"✗ Embedder error: {exc}")
            raise


if __name__ == "__main__":
    # Run tests with pytest
    pytest.main([__file__, "-v", "-s"])