diff --git a/codex-lens/benchmarks/analyze_methods.py b/codex-lens/benchmarks/analyze_methods.py
new file mode 100644
index 00000000..fa51aa3b
--- /dev/null
+++ b/codex-lens/benchmarks/analyze_methods.py
@@ -0,0 +1,281 @@
+"""Analyze hybrid search methods contribution."""
+import json
+import sqlite3
+import time
+from pathlib import Path
+from collections import defaultdict
+import sys
+sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
+
+from codexlens.search.hybrid_search import HybridSearchEngine
+from codexlens.search.ranking import (
+    reciprocal_rank_fusion,
+    cross_encoder_rerank,
+    DEFAULT_WEIGHTS,
+    FTS_FALLBACK_WEIGHTS,
+)
+
+# Use index with most data
+index_path = Path(r"C:\Users\dyw\.codexlens\indexes\D\Claude_dms3\codex-lens\src\codexlens\storage\_index.db")
+
+print("=" * 60)
+print("1. STORAGE ARCHITECTURE ANALYSIS")
+print("=" * 60)
+
+# Analyze storage
+with sqlite3.connect(index_path) as conn:
+    cursor = conn.execute(
+        "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name"
+    )
+    tables = [row[0] for row in cursor.fetchall()]
+
+    print("\nTable Overview:")
+    for table in tables:
+        try:
+            count = conn.execute(f"SELECT COUNT(*) FROM {table}").fetchone()[0]
+            if count > 0:
+                print(f"  {table}: {count} rows")
+        except sqlite3.Error:
+            # Skip virtual or unreadable tables instead of aborting the scan
+            pass
+
+    print("\n--- Conflict Analysis ---")
+
+    chunks_count = 0
+    semantic_count = 0
+
+    if "chunks" in tables:
+        chunks_count = conn.execute("SELECT COUNT(*) FROM chunks").fetchone()[0]
+    if "semantic_chunks" in tables:
+        semantic_count = conn.execute("SELECT COUNT(*) FROM semantic_chunks").fetchone()[0]
+
+    print(f"  chunks table: {chunks_count} rows")
+    print(f"  semantic_chunks table: {semantic_count} rows")
+
+    if semantic_count > 0:
+        col_info = conn.execute("PRAGMA table_info(semantic_chunks)").fetchall()
+        col_names = [c[1] for c in col_info]
+
+        print(f"\n  semantic_chunks columns: {col_names}")
+
+        for col in ["embedding", "embedding_binary", "embedding_dense"]:
+            if col in col_names:
+                null_count = conn.execute(
+                    f"SELECT COUNT(*) FROM semantic_chunks WHERE {col} IS NULL"
+                ).fetchone()[0]
+                non_null = semantic_count - null_count
+                print(f"  {col}: {non_null}/{semantic_count} non-null")
+
+    if "splade_posting_list" in tables:
+        splade_count = conn.execute("SELECT COUNT(*) FROM splade_posting_list").fetchone()[0]
+        print(f"\n  splade_posting_list: {splade_count} postings")
+    else:
+        print("\n  splade_posting_list: NOT EXISTS")
+
+print("\n" + "=" * 60)
+print("2. METHOD CONTRIBUTION ANALYSIS")
+print("=" * 60)
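+
+# Each method below runs in isolation. HybridSearchEngine reads its backend
+# toggles from a config object; a throwaway type() instance stands in for the
+# real Config here, so every attribute the engine touches must be listed
+# explicitly. A hedged sketch of the pattern (attribute set assumed):
+#
+#     stub = type("obj", (object,), {"use_fts_fallback": True, ...})()
+#     engine._config = stub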
+
+queries = [
+    "database connection",
+    "create table",
+    "sqlite store",
+    "migration",
+    "search chunks",
+]
+
+results_summary = {
+    "fts_exact": [],
+    "fts_fuzzy": [],
+    "vector": [],
+    "splade": [],
+}
+
+for query in queries:
+    print(f"\nQuery: '{query}'")
+
+    # FTS Exact
+    try:
+        engine = HybridSearchEngine(weights=FTS_FALLBACK_WEIGHTS)
+        engine._config = type("obj", (object,), {
+            "use_fts_fallback": True,
+            "enable_splade": False,
+            "embedding_use_gpu": True,
+            "symbol_boost_factor": 1.5,
+            "enable_reranking": False,
+        })()
+
+        start = time.perf_counter()
+        results = engine.search(index_path, query, limit=10, enable_fuzzy=False, enable_vector=False)
+        latency = (time.perf_counter() - start) * 1000
+
+        results_summary["fts_exact"].append({"count": len(results), "latency": latency})
+        top_file = results[0].path.split("\\")[-1] if results else "N/A"
+        top_score = results[0].score if results else 0
+        print(f"  FTS Exact: {len(results)} results, {latency:.1f}ms, top: {top_file} ({top_score:.3f})")
+    except Exception as e:
+        print(f"  FTS Exact: ERROR - {e}")
+
+    # FTS Fuzzy
+    try:
+        engine = HybridSearchEngine(weights=FTS_FALLBACK_WEIGHTS)
+        engine._config = type("obj", (object,), {
+            "use_fts_fallback": True,
+            "enable_splade": False,
+            "embedding_use_gpu": True,
+            "symbol_boost_factor": 1.5,
+            "enable_reranking": False,
+        })()
+
+        start = time.perf_counter()
+        results = engine.search(index_path, query, limit=10, enable_fuzzy=True, enable_vector=False)
+        latency = (time.perf_counter() - start) * 1000
+
+        results_summary["fts_fuzzy"].append({"count": len(results), "latency": latency})
+        top_file = results[0].path.split("\\")[-1] if results else "N/A"
+        top_score = results[0].score if results else 0
+        print(f"  FTS Fuzzy: {len(results)} results, {latency:.1f}ms, top: {top_file} ({top_score:.3f})")
+    except Exception as e:
+        print(f"  FTS Fuzzy: ERROR - {e}")
+
+    # Vector
+    try:
+        engine = HybridSearchEngine()
+        engine._config = type("obj", (object,), {
+            "use_fts_fallback": False,
+            "enable_splade": False,
+            "embedding_use_gpu": True,
+            "symbol_boost_factor": 1.5,
+            "enable_reranking": False,
+        })()
+
+        start = time.perf_counter()
+        results = engine.search(index_path, query, limit=10, enable_vector=True, pure_vector=True)
+        latency = (time.perf_counter() - start) * 1000
+
+        results_summary["vector"].append({"count": len(results), "latency": latency})
+        top_file = results[0].path.split("\\")[-1] if results else "N/A"
+        top_score = results[0].score if results else 0
+        print(f"  Vector: {len(results)} results, {latency:.1f}ms, top: {top_file} ({top_score:.3f})")
+    except Exception as e:
+        print(f"  Vector: ERROR - {e}")
+
+    # SPLADE
+    try:
+        engine = HybridSearchEngine(weights={"splade": 1.0})
+        engine._config = type("obj", (object,), {
+            "use_fts_fallback": False,
+            "enable_splade": True,
+            "embedding_use_gpu": True,
+            "symbol_boost_factor": 1.5,
+            "enable_reranking": False,
+        })()
+
+        start = time.perf_counter()
+        results = engine.search(index_path, query, limit=10, enable_fuzzy=False, enable_vector=False)
+        latency = (time.perf_counter() - start) * 1000
+
+        results_summary["splade"].append({"count": len(results), "latency": latency})
+        top_file = results[0].path.split("\\")[-1] if results else "N/A"
+        top_score = results[0].score if results else 0
+        print(f"  SPLADE: {len(results)} results, {latency:.1f}ms, top: {top_file} ({top_score:.3f})")
+    except Exception as e:
+        print(f"  SPLADE: ERROR - {e}")
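+
+# After the loop, results_summary maps each method to per-query stats, e.g.
+# (illustrative values, not from a real run):
+#     {"fts_exact": [{"count": 10, "latency": 12.3}, ...],
+#      "vector":    [{"count": 10, "latency": 45.6}, ...]}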
+
+print("\n--- Summary ---")
+for method, data in results_summary.items():
+    if data:
+        avg_count = sum(d["count"] for d in data) / len(data)
+        avg_latency = sum(d["latency"] for d in data) / len(data)
+        print(f"{method}: avg {avg_count:.1f} results, {avg_latency:.1f}ms")
+
+print("\n" + "=" * 60)
+print("3. FTS + RERANK FUSION EXPERIMENT")
+print("=" * 60)
+
+# Initialize reranker
+reranker = None
+try:
+    from codexlens.semantic.reranker import get_reranker, check_reranker_available
+    ok, _ = check_reranker_available("onnx")
+    if ok:
+        reranker = get_reranker(backend="onnx", use_gpu=True)
+        print("\nReranker loaded: ONNX backend")
+except Exception as e:
+    print(f"\nReranker unavailable: {e}")
+
+test_queries = ["database connection", "create table migration"]
+
+for query in test_queries:
+    print(f"\nQuery: '{query}'")
+
+    # Strategy 1: Standard Hybrid (FTS exact+fuzzy RRF)
+    try:
+        engine = HybridSearchEngine(weights=FTS_FALLBACK_WEIGHTS)
+        engine._config = type("obj", (object,), {
+            "use_fts_fallback": True,
+            "enable_splade": False,
+            "embedding_use_gpu": True,
+            "symbol_boost_factor": 1.5,
+            "enable_reranking": False,
+        })()
+
+        start = time.perf_counter()
+        standard_results = engine.search(index_path, query, limit=10, enable_fuzzy=True, enable_vector=False)
+        standard_latency = (time.perf_counter() - start) * 1000
+
+        print(f"  Standard FTS RRF: {len(standard_results)} results, {standard_latency:.1f}ms")
+        for i, r in enumerate(standard_results[:3]):
+            print(f"    {i+1}. {r.path.split(chr(92))[-1]} (score: {r.score:.4f})")
+    except Exception as e:
+        print(f"  Standard FTS RRF: ERROR - {e}")
+        standard_results = []
+
+    # Strategy 2: FTS + CrossEncoder Rerank
+    if reranker and standard_results:
+        try:
+            start = time.perf_counter()
+            reranked_results = cross_encoder_rerank(query, standard_results, reranker, top_k=10)
+            rerank_latency = (time.perf_counter() - start) * 1000
+
+            print(f"  FTS + Rerank: {len(reranked_results)} results, {rerank_latency:.1f}ms (rerank only)")
+            for i, r in enumerate(reranked_results[:3]):
+                ce_score = r.metadata.get("cross_encoder_prob", r.score)
+                print(f"    {i+1}. {r.path.split(chr(92))[-1]} (CE prob: {ce_score:.4f})")
+
+            # Compare rankings
+            standard_order = [r.path.split("\\")[-1] for r in standard_results[:5]]
+            reranked_order = [r.path.split("\\")[-1] for r in reranked_results[:5]]
+
+            if standard_order != reranked_order:
+                print("    Ranking changed!")
+                print(f"    Before: {standard_order}")
+                print(f"    After:  {reranked_order}")
+            else:
+                print("    Ranking unchanged")
+
+        except Exception as e:
+            print(f"  FTS + Rerank: ERROR - {e}")
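+
+# Cross-encoder reranking scores each (query, candidate) pair jointly with a
+# transformer rather than comparing precomputed vectors, which is why it is
+# applied only to the small FTS candidate list. A hedged sketch of the idea
+# (the reranker's real API may differ):
+#
+#     pairs = [(query, r.content) for r in candidates]
+#     probs = reranker.predict(pairs)   # one forward pass per pair (assumed API)
+#     reranked = [r for r, _ in sorted(zip(candidates, probs),
+#                                      key=lambda t: t[1], reverse=True)]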
+
+print("\n" + "=" * 60)
+print("CONCLUSIONS")
+print("=" * 60)
+print("""
+1. Storage Architecture:
+   - semantic_chunks: Used by cascade-index (binary+dense vectors)
+   - chunks: Used by legacy SQLiteStore (currently empty in this index)
+   - splade_posting_list: Used by SPLADE sparse retrieval
+   - files_fts_*: Used by FTS exact/fuzzy search
+
+   CONFLICT: binary_cascade_search reads from semantic_chunks,
+   but standard FTS reads from files table. These are SEPARATE paths.
+
+2. Method Contributions:
+   - FTS: Fast but limited to keyword matching
+   - Vector: Semantic understanding but requires embeddings
+   - SPLADE: Sparse retrieval, good for keyword+semantic hybrid
+
+3. FTS + Rerank Fusion:
+   - CrossEncoder reranking can improve precision
+   - Adds ~100-200ms latency per query
+   - Most effective when initial FTS recall is good
+""")
diff --git a/codex-lens/benchmarks/method_contribution_analysis.py b/codex-lens/benchmarks/method_contribution_analysis.py
new file mode 100644
index 00000000..e005f958
--- /dev/null
+++ b/codex-lens/benchmarks/method_contribution_analysis.py
@@ -0,0 +1,547 @@
+"""Analysis script for hybrid search method contribution and storage architecture.
+
+This script analyzes:
+1. Individual method contribution in hybrid search (FTS/SPLADE/Vector)
+2. Storage architecture conflicts between different retrieval methods
+3. FTS + Rerank fusion experiment
+"""
+
+import json
+import sqlite3
+import time
+from pathlib import Path
+from typing import Dict, List, Tuple, Any
+from collections import defaultdict
+
+# Add project root to path
+import sys
+sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
+
+from codexlens.storage.registry import RegistryStore
+from codexlens.storage.path_mapper import PathMapper
+from codexlens.search.hybrid_search import HybridSearchEngine, THREE_WAY_WEIGHTS
+from codexlens.search.ranking import (
+    reciprocal_rank_fusion,
+    cross_encoder_rerank,
+    DEFAULT_WEIGHTS,
+    FTS_FALLBACK_WEIGHTS,
+)
+from codexlens.entities import SearchResult
+
+
+def find_project_index(source_path: Path) -> Path:
+    """Find the index database for a project."""
+    registry = RegistryStore()
+    registry.initialize()
+
+    mapper = PathMapper()
+    index_path = mapper.source_to_index_db(source_path)
+
+    if not index_path.exists():
+        nearest = registry.find_nearest_index(source_path)
+        if nearest:
+            index_path = nearest.index_path
+
+    registry.close()
+    return index_path
+
+
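+# Example resolution (paths illustrative): a source tree like
+#     D:/Claude_dms3/codex-lens/src
+# maps to a central index such as
+#     ~/.codexlens/indexes/D/Claude_dms3/codex-lens/src/_index.db
+# falling back to the nearest registered ancestor index when the exact
+# mapping does not exist yet.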
" + "This can cause confusion - binary_cascade reads from semantic_chunks " + "but SQLiteStore reads from chunks." + ) + }) + elif chunks_count == 0 and semantic_count > 0: + results["recommendations"].append( + "chunks table is empty but semantic_chunks has data. " + "Use cascade-index (semantic_chunks) for better semantic search." + ) + elif chunks_count > 0 and semantic_count == 0: + results["recommendations"].append( + "semantic_chunks is empty. Run 'codexlens cascade-index' to enable " + "binary cascade search." + ) + + # 2. Check SPLADE index status + if "splade_posting_list" in tables: + splade_count = results["tables"]["splade_posting_list"]["row_count"] + if splade_count == 0: + results["recommendations"].append( + "SPLADE tables exist but empty. Run SPLADE indexing to enable sparse retrieval." + ) + + # 3. Check FTS tables + fts_tables = [t for t in tables if t.startswith("files_fts")] + if len(fts_tables) >= 2: + results["recommendations"].append( + f"Found {len(fts_tables)} FTS tables: {fts_tables}. " + "Dual FTS (exact + fuzzy) is properly configured." + ) + + return results + + +def analyze_method_contributions( + index_path: Path, + queries: List[str], + limit: int = 20 +) -> Dict[str, Any]: + """Analyze contribution of each retrieval method. + + Runs each method independently and measures: + - Result count + - Latency + - Score distribution + - Overlap with other methods + """ + results = { + "per_query": [], + "summary": {} + } + + for query in queries: + query_result = { + "query": query, + "methods": {}, + "fusion_analysis": {} + } + + # Run each method independently + methods = { + "fts_exact": {"fuzzy": False, "vector": False, "splade": False}, + "fts_fuzzy": {"fuzzy": True, "vector": False, "splade": False}, + "vector": {"fuzzy": False, "vector": True, "splade": False}, + "splade": {"fuzzy": False, "vector": False, "splade": True}, + } + + method_results: Dict[str, List[SearchResult]] = {} + + for method_name, config in methods.items(): + try: + engine = HybridSearchEngine() + + # Set config to disable/enable specific backends + engine._config = type('obj', (object,), { + 'use_fts_fallback': method_name.startswith("fts"), + 'enable_splade': method_name == "splade", + 'embedding_use_gpu': True, + })() + + start = time.perf_counter() + + if method_name == "fts_exact": + # Force FTS fallback mode with fuzzy disabled + engine.weights = FTS_FALLBACK_WEIGHTS.copy() + results_list = engine.search( + index_path, query, limit=limit, + enable_fuzzy=False, enable_vector=False, pure_vector=False + ) + elif method_name == "fts_fuzzy": + engine.weights = FTS_FALLBACK_WEIGHTS.copy() + results_list = engine.search( + index_path, query, limit=limit, + enable_fuzzy=True, enable_vector=False, pure_vector=False + ) + elif method_name == "vector": + results_list = engine.search( + index_path, query, limit=limit, + enable_fuzzy=False, enable_vector=True, pure_vector=True + ) + elif method_name == "splade": + engine.weights = {"splade": 1.0} + results_list = engine.search( + index_path, query, limit=limit, + enable_fuzzy=False, enable_vector=False, pure_vector=False + ) + else: + results_list = [] + + latency = (time.perf_counter() - start) * 1000 + + method_results[method_name] = results_list + + scores = [r.score for r in results_list] + query_result["methods"][method_name] = { + "count": len(results_list), + "latency_ms": latency, + "avg_score": sum(scores) / len(scores) if scores else 0, + "max_score": max(scores) if scores else 0, + "min_score": min(scores) if scores else 0, + 
"top_3_files": [r.path.split("\\")[-1] for r in results_list[:3]] + } + + except Exception as e: + query_result["methods"][method_name] = { + "error": str(e), + "count": 0 + } + + # Compute overlap between methods + method_paths = { + name: set(r.path for r in results) + for name, results in method_results.items() + if results + } + + overlaps = {} + method_names = list(method_paths.keys()) + for i, m1 in enumerate(method_names): + for m2 in method_names[i+1:]: + overlap = len(method_paths[m1] & method_paths[m2]) + union = len(method_paths[m1] | method_paths[m2]) + jaccard = overlap / union if union > 0 else 0 + overlaps[f"{m1}_vs_{m2}"] = { + "overlap_count": overlap, + "jaccard": jaccard, + f"{m1}_unique": len(method_paths[m1] - method_paths[m2]), + f"{m2}_unique": len(method_paths[m2] - method_paths[m1]), + } + + query_result["overlaps"] = overlaps + + # Analyze RRF fusion contribution + if len(method_results) >= 2: + # Compute RRF with each method's contribution + rrf_map = {} + for name, results in method_results.items(): + if results and name in ["fts_exact", "splade", "vector"]: + # Rename for RRF + rrf_name = name.replace("fts_exact", "exact") + rrf_map[rrf_name] = results + + if rrf_map: + fused = reciprocal_rank_fusion(rrf_map, k=60) + + # Analyze which methods contributed to top results + source_contributions = defaultdict(int) + for r in fused[:10]: + source_ranks = r.metadata.get("source_ranks", {}) + for source in source_ranks: + source_contributions[source] += 1 + + query_result["fusion_analysis"] = { + "total_fused": len(fused), + "top_10_source_distribution": dict(source_contributions) + } + + results["per_query"].append(query_result) + + # Compute summary statistics + method_stats = defaultdict(lambda: {"counts": [], "latencies": []}) + for qr in results["per_query"]: + for method, data in qr["methods"].items(): + if "count" in data: + method_stats[method]["counts"].append(data["count"]) + if "latency_ms" in data: + method_stats[method]["latencies"].append(data["latency_ms"]) + + results["summary"] = { + method: { + "avg_count": sum(s["counts"]) / len(s["counts"]) if s["counts"] else 0, + "avg_latency_ms": sum(s["latencies"]) / len(s["latencies"]) if s["latencies"] else 0, + } + for method, s in method_stats.items() + } + + return results + + +def experiment_fts_rerank_fusion( + index_path: Path, + queries: List[str], + limit: int = 10, + coarse_k: int = 50 +) -> Dict[str, Any]: + """Experiment: FTS + Rerank fusion vs standard hybrid. + + Compares: + 1. Standard Hybrid (SPLADE + Vector RRF) + 2. 
+def experiment_fts_rerank_fusion(
+    index_path: Path,
+    queries: List[str],
+    limit: int = 10,
+    coarse_k: int = 50
+) -> Dict[str, Any]:
+    """Experiment: FTS + Rerank fusion vs standard hybrid.
+
+    Compares:
+    1. Standard Hybrid (SPLADE + Vector RRF)
+    2. FTS + CrossEncoder Rerank -> then fuse with Vector
+    """
+    results = {
+        "per_query": [],
+        "summary": {}
+    }
+
+    # Initialize reranker
+    try:
+        from codexlens.semantic.reranker import get_reranker, check_reranker_available
+        ok, _ = check_reranker_available("onnx")
+        if ok:
+            reranker = get_reranker(backend="onnx", use_gpu=True)
+        else:
+            reranker = None
+    except Exception as e:
+        print(f"Reranker unavailable: {e}")
+        reranker = None
+
+    for query in queries:
+        query_result = {
+            "query": query,
+            "strategies": {}
+        }
+
+        # Strategy 1: Standard Hybrid (SPLADE + Vector)
+        try:
+            engine = HybridSearchEngine(weights=DEFAULT_WEIGHTS)
+            engine._config = type('obj', (object,), {
+                'enable_splade': True,
+                'use_fts_fallback': False,
+                'embedding_use_gpu': True,
+                'symbol_boost_factor': 1.5,
+                'enable_reranking': False,
+            })()
+
+            start = time.perf_counter()
+            standard_results = engine.search(
+                index_path, query, limit=limit,
+                enable_vector=True
+            )
+            standard_latency = (time.perf_counter() - start) * 1000
+
+            query_result["strategies"]["standard_hybrid"] = {
+                "count": len(standard_results),
+                "latency_ms": standard_latency,
+                "top_5": [r.path.split("\\")[-1] for r in standard_results[:5]],
+                "scores": [r.score for r in standard_results[:5]]
+            }
+        except Exception as e:
+            query_result["strategies"]["standard_hybrid"] = {"error": str(e)}
+
+        # Strategy 2: FTS + Rerank -> Fuse with Vector
+        try:
+            # Step 1: Get FTS results (coarse)
+            fts_engine = HybridSearchEngine(weights=FTS_FALLBACK_WEIGHTS)
+            fts_engine._config = type('obj', (object,), {
+                'use_fts_fallback': True,
+                'enable_splade': False,
+                'embedding_use_gpu': True,
+                'symbol_boost_factor': 1.5,
+                'enable_reranking': False,
+            })()
+
+            start = time.perf_counter()
+            fts_results = fts_engine.search(
+                index_path, query, limit=coarse_k,
+                enable_fuzzy=True, enable_vector=False
+            )
+            fts_latency = (time.perf_counter() - start) * 1000
+
+            # Step 2: Rerank FTS results with CrossEncoder
+            if reranker and fts_results:
+                rerank_start = time.perf_counter()
+                reranked_fts = cross_encoder_rerank(
+                    query, fts_results, reranker, top_k=20
+                )
+                rerank_latency = (time.perf_counter() - rerank_start) * 1000
+            else:
+                reranked_fts = fts_results[:20]
+                rerank_latency = 0
+
+            # Step 3: Get Vector results
+            vector_engine = HybridSearchEngine()
+            vector_results = vector_engine.search(
+                index_path, query, limit=20,
+                enable_vector=True, pure_vector=True
+            )
+
+            # Step 4: Fuse reranked FTS with Vector
+            if reranked_fts and vector_results:
+                fusion_map = {
+                    "fts_reranked": reranked_fts,
+                    "vector": vector_results
+                }
+                fused_results = reciprocal_rank_fusion(
+                    fusion_map,
+                    weights={"fts_reranked": 0.5, "vector": 0.5},
+                    k=60
+                )
+            else:
+                fused_results = reranked_fts or vector_results or []
+
+            # End-to-end latency for all four steps, measured from the
+            # FTS start so nothing is double-counted
+            total_latency = (time.perf_counter() - start) * 1000
+
+            query_result["strategies"]["fts_rerank_fusion"] = {
+                "count": len(fused_results),
+                "total_latency_ms": total_latency,
+                "fts_latency_ms": fts_latency,
+                "rerank_latency_ms": rerank_latency,
+                "top_5": [r.path.split("\\")[-1] for r in fused_results[:5]],
+                "scores": [r.score for r in fused_results[:5]]
+            }
+        except Exception as e:
+            query_result["strategies"]["fts_rerank_fusion"] = {"error": str(e)}
+
+        # Compute overlap between strategies
+        if (
+            "error" not in query_result["strategies"].get("standard_hybrid", {})
+            and "error" not in query_result["strategies"].get("fts_rerank_fusion", {})
+        ):
+            standard_paths = set(r.path.split("\\")[-1] for r in standard_results[:10])
+            fts_rerank_paths = set(r.path.split("\\")[-1] for r in fused_results[:10])
+
+            overlap = len(standard_paths & fts_rerank_paths)
+            query_result["comparison"] = {
+                "top_10_overlap": overlap,
+                "standard_unique": list(standard_paths - fts_rerank_paths)[:3],
+                "fts_rerank_unique": list(fts_rerank_paths - standard_paths)[:3]
+            }
+
+        results["per_query"].append(query_result)
+
+    return results
+
+
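+# Shape of strategy 2 above (coarse-to-fine):
+#     FTS, fuzzy enabled (top coarse_k=50)
+#       -> cross-encoder rerank (keep top 20)
+#       -> RRF fusion with pure-vector results (0.5 / 0.5, k=60)
+# The coarse stage bounds reranker cost: rerank latency scales with
+# coarse_k, not with index size.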
+def main():
+    """Run all analyses."""
+    source_path = Path("D:/Claude_dms3/codex-lens/src")
+    index_path = find_project_index(source_path)
+
+    print(f"Using index: {index_path}")
+    print(f"Index exists: {index_path.exists()}")
+    print()
+
+    # Test queries
+    queries = [
+        "binary quantization",
+        "hamming distance search",
+        "embeddings generation",
+        "reranking algorithm",
+        "database connection handling",
+    ]
+
+    # 1. Storage Architecture Analysis
+    print("=" * 60)
+    print("1. STORAGE ARCHITECTURE ANALYSIS")
+    print("=" * 60)
+
+    storage_analysis = analyze_storage_architecture(index_path)
+
+    print("\nTable Overview:")
+    for table, info in sorted(storage_analysis["tables"].items()):
+        if "row_count" in info:
+            print(f"  {table}: {info['row_count']} rows")
+
+    print("\nConflicts Detected:")
+    for conflict in storage_analysis["conflicts"]:
+        print(f"  - {conflict['description']}")
+
+    print("\nRecommendations:")
+    for rec in storage_analysis["recommendations"]:
+        print(f"  - {rec}")
+
+    # 2. Method Contribution Analysis
+    print("\n" + "=" * 60)
+    print("2. METHOD CONTRIBUTION ANALYSIS")
+    print("=" * 60)
+
+    contribution_analysis = analyze_method_contributions(index_path, queries)
+
+    print("\nPer-Query Results:")
+    for qr in contribution_analysis["per_query"]:
+        print(f"\n  Query: '{qr['query']}'")
+        for method, data in qr["methods"].items():
+            if "error" not in data:
+                print(f"    {method}: {data['count']} results, {data['latency_ms']:.1f}ms")
+                if data.get("top_3_files"):
+                    print(f"      Top 3: {', '.join(data['top_3_files'])}")
+
+        if qr.get("overlaps"):
+            print("    Overlaps:")
+            for pair, info in qr["overlaps"].items():
+                print(f"      {pair}: {info['overlap_count']} common (Jaccard: {info['jaccard']:.2f})")
+
+    print("\nSummary:")
+    for method, stats in contribution_analysis["summary"].items():
+        print(f"  {method}: avg {stats['avg_count']:.1f} results, {stats['avg_latency_ms']:.1f}ms")
+
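+    # Top-10 overlap below counts shared file names between the two
+    # strategies' first ten results; a low overlap means reranking changed
+    # what surfaces, not merely the order.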
+    # 3. FTS + Rerank Fusion Experiment
+    print("\n" + "=" * 60)
+    print("3. FTS + RERANK FUSION EXPERIMENT")
+    print("=" * 60)
+
+    fusion_experiment = experiment_fts_rerank_fusion(index_path, queries)
+
+    print("\nPer-Query Comparison:")
+    for qr in fusion_experiment["per_query"]:
+        print(f"\n  Query: '{qr['query']}'")
+        for strategy, data in qr["strategies"].items():
+            if "error" not in data:
+                latency = data.get("total_latency_ms") or data.get("latency_ms", 0)
+                print(f"    {strategy}: {data['count']} results, {latency:.1f}ms")
+                if data.get("top_5"):
+                    print(f"      Top 5: {', '.join(data['top_5'][:3])}...")
+
+        if qr.get("comparison"):
+            comp = qr["comparison"]
+            print(f"    Top-10 Overlap: {comp['top_10_overlap']}/10")
+
+    # Save full results
+    output_path = Path(__file__).parent / "results" / "method_contribution_analysis.json"
+    output_path.parent.mkdir(exist_ok=True)
+
+    full_results = {
+        "storage_analysis": storage_analysis,
+        "contribution_analysis": contribution_analysis,
+        "fusion_experiment": fusion_experiment
+    }
+
+    with open(output_path, "w", encoding="utf-8") as f:
+        json.dump(full_results, f, indent=2, default=str)
+
+    print(f"\n\nFull results saved to: {output_path}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/codex-lens/benchmarks/results/method_contribution_analysis.json b/codex-lens/benchmarks/results/method_contribution_analysis.json
new file mode 100644
index 00000000..f192b4fa
--- /dev/null
+++ b/codex-lens/benchmarks/results/method_contribution_analysis.json
@@ -0,0 +1,406 @@
+{
+  "storage_analysis": {
+    "tables": {
+      "code_relationships": {
+        "row_count": 0,
+        "columns": [
+          "id",
+          "source_symbol_id",
+          "target_qualified_name",
+          "relationship_type",
+          "source_line",
+          "target_file"
+        ]
+      },
+      "embeddings_config": {
+        "row_count": 1,
+        "columns": [
+          "id",
+          "model_profile",
+          "model_name",
+          "embedding_dim",
+          "backend",
+          "created_at",
+          "updated_at"
+        ]
+      },
+      "file_keywords": {
+        "row_count": 0,
+        "columns": [
+          "file_id",
+          "keyword_id"
+        ]
+      },
+      "files": {
+        "row_count": 0,
+        "columns": [
+          "id",
+          "name",
+          "full_path",
+          "language",
+          "content",
+          "mtime",
+          "line_count"
+        ]
+      },
+      "files_fts_exact": {
+        "row_count": 0,
+        "columns": [
+          "name",
+          "full_path",
+          "content"
+        ]
+      },
+      "files_fts_exact_config": {
+        "row_count": 1,
+        "columns": [
+          "k",
+          "v"
+        ]
+      },
+      "files_fts_exact_data": {
+        "row_count": 2,
+        "columns": [
+          "id",
+          "block"
+        ]
+      },
+      "files_fts_exact_docsize": {
+        "row_count": 0,
+        "columns": [
+          "id",
+          "sz"
+        ]
+      },
+      "files_fts_exact_idx": {
+        "row_count": 0,
+        "columns": [
+          "segid",
+          "term",
+          "pgno"
+        ]
+      },
+      "files_fts_fuzzy": {
+        "row_count": 0,
+        "columns": [
+          "name",
+          "full_path",
+          "content"
+        ]
+      },
+      "files_fts_fuzzy_config": {
+        "row_count": 1,
+        "columns": [
+          "k",
+          "v"
+        ]
+      },
+      "files_fts_fuzzy_data": {
+        "row_count": 2,
+        "columns": [
+          "id",
+          "block"
+        ]
+      },
+      "files_fts_fuzzy_docsize": {
+        "row_count": 0,
+        "columns": [
+          "id",
+          "sz"
+        ]
+      },
+      "files_fts_fuzzy_idx": {
+        "row_count": 0,
+        "columns": [
+          "segid",
+          "term",
+          "pgno"
+        ]
+      },
+      "graph_neighbors": {
+        "row_count": 0,
+        "columns": [
+          "source_symbol_id",
+          "neighbor_symbol_id",
+          "relationship_depth"
+        ]
+      },
+      "keywords": {
+        "row_count": 0,
+        "columns": [
+          "id",
+          "keyword"
+        ]
+      },
+      "merkle_hashes": {
+        "row_count": 0,
+        "columns": [
+          "file_id",
+          "sha256",
+          "updated_at"
+        ]
+      },
+      "merkle_state": {
+        "row_count": 1,
+        "columns": [
+          "id",
+          "root_hash",
+          "updated_at"
+        ]
+      },
+ "embedding_binary", + "embedding_dense" + ] + }, + "semantic_metadata": { + "row_count": 0, + "columns": [ + "id", + "file_id", + "summary", + "purpose", + "llm_tool", + "generated_at" + ] + }, + "sqlite_sequence": { + "row_count": 0, + "columns": [ + "name", + "seq" + ] + }, + "subdirs": { + "row_count": 2, + "columns": [ + "id", + "name", + "index_path", + "files_count", + "last_updated" + ] + }, + "symbols": { + "row_count": 0, + "columns": [ + "id", + "file_id", + "name", + "kind", + "start_line", + "end_line" + ] + } + }, + "conflicts": [], + "recommendations": [ + "Found 10 FTS tables: ['files_fts_exact', 'files_fts_exact_config', 'files_fts_exact_data', 'files_fts_exact_docsize', 'files_fts_exact_idx', 'files_fts_fuzzy', 'files_fts_fuzzy_config', 'files_fts_fuzzy_data', 'files_fts_fuzzy_docsize', 'files_fts_fuzzy_idx']. Dual FTS (exact + fuzzy) is properly configured." + ] + }, + "contribution_analysis": { + "per_query": [ + { + "query": "binary quantization", + "methods": { + "fts_exact": { + "error": "'obj' object has no attribute 'symbol_boost_factor'", + "count": 0 + }, + "fts_fuzzy": { + "error": "'obj' object has no attribute 'symbol_boost_factor'", + "count": 0 + }, + "vector": { + "error": "'obj' object has no attribute 'symbol_boost_factor'", + "count": 0 + }, + "splade": { + "error": "'obj' object has no attribute 'symbol_boost_factor'", + "count": 0 + } + }, + "fusion_analysis": {}, + "overlaps": {} + }, + { + "query": "hamming distance search", + "methods": { + "fts_exact": { + "error": "'obj' object has no attribute 'symbol_boost_factor'", + "count": 0 + }, + "fts_fuzzy": { + "error": "'obj' object has no attribute 'symbol_boost_factor'", + "count": 0 + }, + "vector": { + "error": "'obj' object has no attribute 'symbol_boost_factor'", + "count": 0 + }, + "splade": { + "error": "'obj' object has no attribute 'symbol_boost_factor'", + "count": 0 + } + }, + "fusion_analysis": {}, + "overlaps": {} + }, + { + "query": "embeddings generation", + "methods": { + "fts_exact": { + "error": "'obj' object has no attribute 'symbol_boost_factor'", + "count": 0 + }, + "fts_fuzzy": { + "error": "'obj' object has no attribute 'symbol_boost_factor'", + "count": 0 + }, + "vector": { + "error": "'obj' object has no attribute 'symbol_boost_factor'", + "count": 0 + }, + "splade": { + "error": "'obj' object has no attribute 'symbol_boost_factor'", + "count": 0 + } + }, + "fusion_analysis": {}, + "overlaps": {} + }, + { + "query": "reranking algorithm", + "methods": { + "fts_exact": { + "error": "'obj' object has no attribute 'symbol_boost_factor'", + "count": 0 + }, + "fts_fuzzy": { + "error": "'obj' object has no attribute 'symbol_boost_factor'", + "count": 0 + }, + "vector": { + "error": "'obj' object has no attribute 'symbol_boost_factor'", + "count": 0 + }, + "splade": { + "error": "'obj' object has no attribute 'symbol_boost_factor'", + "count": 0 + } + }, + "fusion_analysis": {}, + "overlaps": {} + }, + { + "query": "database connection handling", + "methods": { + "fts_exact": { + "error": "'obj' object has no attribute 'symbol_boost_factor'", + "count": 0 + }, + "fts_fuzzy": { + "error": "'obj' object has no attribute 'symbol_boost_factor'", + "count": 0 + }, + "vector": { + "error": "'obj' object has no attribute 'symbol_boost_factor'", + "count": 0 + }, + "splade": { + "error": "'obj' object has no attribute 'symbol_boost_factor'", + "count": 0 + } + }, + "fusion_analysis": {}, + "overlaps": {} + } + ], + "summary": { + "fts_exact": { + "avg_count": 0.0, + "avg_latency_ms": 0 + }, 
+ "fts_fuzzy": { + "avg_count": 0.0, + "avg_latency_ms": 0 + }, + "vector": { + "avg_count": 0.0, + "avg_latency_ms": 0 + }, + "splade": { + "avg_count": 0.0, + "avg_latency_ms": 0 + } + } + }, + "fusion_experiment": { + "per_query": [ + { + "query": "binary quantization", + "strategies": { + "standard_hybrid": { + "error": "'obj' object has no attribute 'symbol_boost_factor'" + }, + "fts_rerank_fusion": { + "error": "'obj' object has no attribute 'symbol_boost_factor'" + } + } + }, + { + "query": "hamming distance search", + "strategies": { + "standard_hybrid": { + "error": "'obj' object has no attribute 'symbol_boost_factor'" + }, + "fts_rerank_fusion": { + "error": "'obj' object has no attribute 'symbol_boost_factor'" + } + } + }, + { + "query": "embeddings generation", + "strategies": { + "standard_hybrid": { + "error": "'obj' object has no attribute 'symbol_boost_factor'" + }, + "fts_rerank_fusion": { + "error": "'obj' object has no attribute 'symbol_boost_factor'" + } + } + }, + { + "query": "reranking algorithm", + "strategies": { + "standard_hybrid": { + "error": "'obj' object has no attribute 'symbol_boost_factor'" + }, + "fts_rerank_fusion": { + "error": "'obj' object has no attribute 'symbol_boost_factor'" + } + } + }, + { + "query": "database connection handling", + "strategies": { + "standard_hybrid": { + "error": "'obj' object has no attribute 'symbol_boost_factor'" + }, + "fts_rerank_fusion": { + "error": "'obj' object has no attribute 'symbol_boost_factor'" + } + } + } + ], + "summary": {} + } +} \ No newline at end of file diff --git a/codex-lens/src/codexlens/semantic/vector_store.py b/codex-lens/src/codexlens/semantic/vector_store.py index ed7c237d..bc947164 100644 --- a/codex-lens/src/codexlens/semantic/vector_store.py +++ b/codex-lens/src/codexlens/semantic/vector_store.py @@ -1033,6 +1033,28 @@ class VectorStore: row = conn.execute("SELECT COUNT(*) FROM semantic_chunks").fetchone() return row[0] if row else 0 + def get_all_chunks(self) -> List[SemanticChunk]: + """Get all chunks from the store. + + Returns: + List of SemanticChunk objects with id and content. + """ + with sqlite3.connect(self.db_path) as conn: + conn.row_factory = sqlite3.Row + rows = conn.execute( + "SELECT id, file_path, content, metadata FROM semantic_chunks" + ).fetchall() + + chunks = [] + for row in rows: + chunks.append(SemanticChunk( + id=row["id"], + content=row["content"], + file_path=row["file_path"], + metadata=json.loads(row["metadata"]) if row["metadata"] else None, + )) + return chunks + def clear_cache(self) -> None: """Manually clear the embedding cache.""" self._invalidate_cache()