Mirror of https://github.com/catlog22/Claude-Code-Workflow.git (synced 2026-02-12 02:37:45 +08:00)
feat: Add method to retrieve all semantic chunks from the vector store
- Implemented `get_all_chunks` method in `VectorStore` class to fetch all semantic chunks from the database (a hedged usage sketch follows below).
- Added a new benchmark script `analyze_methods.py` for analyzing hybrid search methods and storage architecture.
- Included detailed analysis of method contributions, storage conflicts, and FTS + Rerank fusion experiments.
- Updated results JSON structure to reflect new analysis outputs and method performance metrics.
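A minimal usage sketch of the new accessor. The import path and constructor signature are assumptions; the commit only confirms that the class reads `self.db_path` and returns `SemanticChunk` objects carrying id, content, file_path, and optional metadata (see the diff at the bottom of this page):

# Hypothetical usage of VectorStore.get_all_chunks; module path and
# constructor signature are assumptions, not confirmed by this commit.
from pathlib import Path

from codexlens.semantic.vector_store import VectorStore  # import path assumed

store = VectorStore(db_path=Path(r"C:\Users\dyw\.codexlens\indexes\D\Claude_dms3"
                                 r"\codex-lens\src\codexlens\storage\_index.db"))
for chunk in store.get_all_chunks():
    preview = chunk.content[:60].replace("\n", " ")
    print(f"{chunk.id}  {chunk.file_path}: {preview}")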
codex-lens/benchmarks/analyze_methods.py (new file, 281 lines)
@@ -0,0 +1,281 @@
"""Analyze hybrid search methods contribution."""

import json
import sqlite3
import time
from pathlib import Path
from collections import defaultdict
import sys

sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from codexlens.search.hybrid_search import HybridSearchEngine
from codexlens.search.ranking import (
    reciprocal_rank_fusion,
    cross_encoder_rerank,
    DEFAULT_WEIGHTS,
    FTS_FALLBACK_WEIGHTS,
)

# Use index with most data
index_path = Path(r"C:\Users\dyw\.codexlens\indexes\D\Claude_dms3\codex-lens\src\codexlens\storage\_index.db")

print("=" * 60)
print("1. STORAGE ARCHITECTURE ANALYSIS")
print("=" * 60)

# Analyze storage
with sqlite3.connect(index_path) as conn:
    cursor = conn.execute(
        "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name"
    )
    tables = [row[0] for row in cursor.fetchall()]

    print("\nTable Overview:")
    for table in tables:
        try:
            count = conn.execute(f"SELECT COUNT(*) FROM {table}").fetchone()[0]
            if count > 0:
                print(f"  {table}: {count} rows")
        except sqlite3.Error:
            pass

    print("\n--- Conflict Analysis ---")

    chunks_count = 0
    semantic_count = 0

    if "chunks" in tables:
        chunks_count = conn.execute("SELECT COUNT(*) FROM chunks").fetchone()[0]
    if "semantic_chunks" in tables:
        semantic_count = conn.execute("SELECT COUNT(*) FROM semantic_chunks").fetchone()[0]

    print(f"  chunks table: {chunks_count} rows")
    print(f"  semantic_chunks table: {semantic_count} rows")

    if semantic_count > 0:
        col_info = conn.execute("PRAGMA table_info(semantic_chunks)").fetchall()
        col_names = [c[1] for c in col_info]

        print(f"\n  semantic_chunks columns: {col_names}")

        for col in ["embedding", "embedding_binary", "embedding_dense"]:
            if col in col_names:
                null_count = conn.execute(
                    f"SELECT COUNT(*) FROM semantic_chunks WHERE {col} IS NULL"
                ).fetchone()[0]
                non_null = semantic_count - null_count
                print(f"  {col}: {non_null}/{semantic_count} non-null")

    if "splade_posting_list" in tables:
        splade_count = conn.execute("SELECT COUNT(*) FROM splade_posting_list").fetchone()[0]
        print(f"\n  splade_posting_list: {splade_count} postings")
    else:
        print("\n  splade_posting_list: NOT EXISTS")

print("\n" + "=" * 60)
print("2. METHOD CONTRIBUTION ANALYSIS")
print("=" * 60)

queries = [
    "database connection",
    "create table",
    "sqlite store",
    "migration",
    "search chunks",
]

results_summary = {
    "fts_exact": [],
    "fts_fuzzy": [],
    "vector": [],
    "splade": [],
}

for query in queries:
    print(f"\nQuery: '{query}'")

    # FTS Exact
    try:
        engine = HybridSearchEngine(weights=FTS_FALLBACK_WEIGHTS)
        engine._config = type("obj", (object,), {
            "use_fts_fallback": True,
            "enable_splade": False,
            "embedding_use_gpu": True,
            "symbol_boost_factor": 1.5,
            "enable_reranking": False,
        })()

        start = time.perf_counter()
        results = engine.search(index_path, query, limit=10, enable_fuzzy=False, enable_vector=False)
        latency = (time.perf_counter() - start) * 1000

        results_summary["fts_exact"].append({"count": len(results), "latency": latency})
        top_file = results[0].path.split("\\")[-1] if results else "N/A"
        top_score = results[0].score if results else 0
        print(f"  FTS Exact: {len(results)} results, {latency:.1f}ms, top: {top_file} ({top_score:.3f})")
    except Exception as e:
        print(f"  FTS Exact: ERROR - {e}")

    # FTS Fuzzy
    try:
        engine = HybridSearchEngine(weights=FTS_FALLBACK_WEIGHTS)
        engine._config = type("obj", (object,), {
            "use_fts_fallback": True,
            "enable_splade": False,
            "embedding_use_gpu": True,
            "symbol_boost_factor": 1.5,
            "enable_reranking": False,
        })()

        start = time.perf_counter()
        results = engine.search(index_path, query, limit=10, enable_fuzzy=True, enable_vector=False)
        latency = (time.perf_counter() - start) * 1000

        results_summary["fts_fuzzy"].append({"count": len(results), "latency": latency})
        top_file = results[0].path.split("\\")[-1] if results else "N/A"
        top_score = results[0].score if results else 0
        print(f"  FTS Fuzzy: {len(results)} results, {latency:.1f}ms, top: {top_file} ({top_score:.3f})")
    except Exception as e:
        print(f"  FTS Fuzzy: ERROR - {e}")

    # Vector
    try:
        engine = HybridSearchEngine()
        engine._config = type("obj", (object,), {
            "use_fts_fallback": False,
            "enable_splade": False,
            "embedding_use_gpu": True,
            "symbol_boost_factor": 1.5,
            "enable_reranking": False,
        })()

        start = time.perf_counter()
        results = engine.search(index_path, query, limit=10, enable_vector=True, pure_vector=True)
        latency = (time.perf_counter() - start) * 1000

        results_summary["vector"].append({"count": len(results), "latency": latency})
        top_file = results[0].path.split("\\")[-1] if results else "N/A"
        top_score = results[0].score if results else 0
        print(f"  Vector: {len(results)} results, {latency:.1f}ms, top: {top_file} ({top_score:.3f})")
    except Exception as e:
        print(f"  Vector: ERROR - {e}")

    # SPLADE
    try:
        engine = HybridSearchEngine(weights={"splade": 1.0})
        engine._config = type("obj", (object,), {
            "use_fts_fallback": False,
            "enable_splade": True,
            "embedding_use_gpu": True,
            "symbol_boost_factor": 1.5,
            "enable_reranking": False,
        })()

        start = time.perf_counter()
        results = engine.search(index_path, query, limit=10, enable_fuzzy=False, enable_vector=False)
        latency = (time.perf_counter() - start) * 1000

        results_summary["splade"].append({"count": len(results), "latency": latency})
        top_file = results[0].path.split("\\")[-1] if results else "N/A"
        top_score = results[0].score if results else 0
        print(f"  SPLADE: {len(results)} results, {latency:.1f}ms, top: {top_file} ({top_score:.3f})")
    except Exception as e:
        print(f"  SPLADE: ERROR - {e}")

print("\n--- Summary ---")
for method, data in results_summary.items():
    if data:
        avg_count = sum(d["count"] for d in data) / len(data)
        avg_latency = sum(d["latency"] for d in data) / len(data)
        print(f"{method}: avg {avg_count:.1f} results, {avg_latency:.1f}ms")

print("\n" + "=" * 60)
print("3. FTS + RERANK FUSION EXPERIMENT")
print("=" * 60)

# Initialize reranker
reranker = None
try:
    from codexlens.semantic.reranker import get_reranker, check_reranker_available
    ok, _ = check_reranker_available("onnx")
    if ok:
        reranker = get_reranker(backend="onnx", use_gpu=True)
        print("\nReranker loaded: ONNX backend")
except Exception as e:
    print(f"\nReranker unavailable: {e}")

test_queries = ["database connection", "create table migration"]

for query in test_queries:
    print(f"\nQuery: '{query}'")

    # Strategy 1: Standard Hybrid (FTS exact+fuzzy RRF)
    try:
        engine = HybridSearchEngine(weights=FTS_FALLBACK_WEIGHTS)
        engine._config = type("obj", (object,), {
            "use_fts_fallback": True,
            "enable_splade": False,
            "embedding_use_gpu": True,
            "symbol_boost_factor": 1.5,
            "enable_reranking": False,
        })()

        start = time.perf_counter()
        standard_results = engine.search(index_path, query, limit=10, enable_fuzzy=True, enable_vector=False)
        standard_latency = (time.perf_counter() - start) * 1000

        print(f"  Standard FTS RRF: {len(standard_results)} results, {standard_latency:.1f}ms")
        for i, r in enumerate(standard_results[:3]):
            print(f"    {i+1}. {r.path.split(chr(92))[-1]} (score: {r.score:.4f})")
    except Exception as e:
        print(f"  Standard FTS RRF: ERROR - {e}")
        standard_results = []

    # Strategy 2: FTS + CrossEncoder Rerank
    if reranker and standard_results:
        try:
            start = time.perf_counter()
            reranked_results = cross_encoder_rerank(query, standard_results, reranker, top_k=10)
            rerank_latency = (time.perf_counter() - start) * 1000

            print(f"  FTS + Rerank: {len(reranked_results)} results, {rerank_latency:.1f}ms (rerank only)")
            for i, r in enumerate(reranked_results[:3]):
                ce_score = r.metadata.get("cross_encoder_prob", r.score)
                print(f"    {i+1}. {r.path.split(chr(92))[-1]} (CE prob: {ce_score:.4f})")

            # Compare rankings
            standard_order = [r.path.split("\\")[-1] for r in standard_results[:5]]
            reranked_order = [r.path.split("\\")[-1] for r in reranked_results[:5]]

            if standard_order != reranked_order:
                print("    Ranking changed!")
                print(f"    Before: {standard_order}")
                print(f"    After:  {reranked_order}")
            else:
                print("    Ranking unchanged")

        except Exception as e:
            print(f"  FTS + Rerank: ERROR - {e}")

print("\n" + "=" * 60)
print("CONCLUSIONS")
print("=" * 60)
print("""
1. Storage Architecture:
   - semantic_chunks: Used by cascade-index (binary+dense vectors)
   - chunks: Used by legacy SQLiteStore (currently empty in this index)
   - splade_posting_list: Used by SPLADE sparse retrieval
   - files_fts_*: Used by FTS exact/fuzzy search

   CONFLICT: binary_cascade_search reads from semantic_chunks,
   but standard FTS reads from the files table. These are SEPARATE paths.

2. Method Contributions:
   - FTS: Fast but limited to keyword matching
   - Vector: Semantic understanding but requires embeddings
   - SPLADE: Sparse retrieval, good for keyword+semantic hybrid

3. FTS + Rerank Fusion:
   - CrossEncoder reranking can improve precision
   - Adds ~100-200ms latency per query
   - Most effective when initial FTS recall is good
""")
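The script above merges per-method rankings through `reciprocal_rank_fusion(rrf_map, k=60)`. A minimal standalone sketch of weighted RRF under the textbook formula score(d) = sum over sources s of w_s / (k + rank_s(d)), with the same k=60; the real codexlens implementation works on SearchResult objects and records source_ranks metadata, and the file names below are purely illustrative:

from collections import defaultdict
from typing import Dict, List, Optional

def rrf_fuse(ranked_lists: Dict[str, List[str]],
             weights: Optional[Dict[str, float]] = None,
             k: int = 60) -> List[str]:
    """Weighted reciprocal rank fusion: score(d) = sum_s w_s / (k + rank_s(d))."""
    scores: Dict[str, float] = defaultdict(float)
    for source, docs in ranked_lists.items():
        w = 1.0 if weights is None else weights.get(source, 0.0)
        for rank, doc in enumerate(docs, start=1):
            scores[doc] += w / (k + rank)
    return sorted(scores, key=scores.get, reverse=True)

# Documents ranked well by both lists float to the top (names illustrative).
print(rrf_fuse({
    "exact": ["sqlite_store.py", "migrations.py", "hybrid_search.py"],
    "vector": ["hybrid_search.py", "sqlite_store.py", "embedder.py"],
}))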
codex-lens/benchmarks/method_contribution_analysis.py (new file, 547 lines)
@@ -0,0 +1,547 @@
"""Analysis script for hybrid search method contribution and storage architecture.

This script analyzes:
1. Individual method contribution in hybrid search (FTS/SPLADE/Vector)
2. Storage architecture conflicts between different retrieval methods
3. FTS + Rerank fusion experiment
"""

import json
import sqlite3
import time
from pathlib import Path
from typing import Dict, List, Tuple, Any
from collections import defaultdict

# Add project root to path
import sys
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from codexlens.storage.registry import RegistryStore
from codexlens.storage.path_mapper import PathMapper
from codexlens.search.hybrid_search import HybridSearchEngine
from codexlens.search.ranking import (
    reciprocal_rank_fusion,
    cross_encoder_rerank,
    DEFAULT_WEIGHTS,
    FTS_FALLBACK_WEIGHTS,
)
from codexlens.search.hybrid_search import THREE_WAY_WEIGHTS
from codexlens.entities import SearchResult


def find_project_index(source_path: Path) -> Path:
    """Find the index database for a project."""
    registry = RegistryStore()
    registry.initialize()

    mapper = PathMapper()
    index_path = mapper.source_to_index_db(source_path)

    if not index_path.exists():
        nearest = registry.find_nearest_index(source_path)
        if nearest:
            index_path = nearest.index_path

    registry.close()
    return index_path


def analyze_storage_architecture(index_path: Path) -> Dict[str, Any]:
    """Analyze storage tables and check for conflicts.

    Returns:
        Dictionary with table analysis and conflict detection.
    """
    results = {
        "tables": {},
        "conflicts": [],
        "recommendations": []
    }

    with sqlite3.connect(index_path) as conn:
        # Get all tables
        cursor = conn.execute(
            "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name"
        )
        tables = [row[0] for row in cursor.fetchall()]

        for table in tables:
            # Get row count and columns
            try:
                count = conn.execute(f"SELECT COUNT(*) FROM {table}").fetchone()[0]
                cols = conn.execute(f"PRAGMA table_info({table})").fetchall()
                col_names = [c[1] for c in cols]

                results["tables"][table] = {
                    "row_count": count,
                    "columns": col_names
                }
            except Exception as e:
                results["tables"][table] = {"error": str(e)}

        # Check for data overlap/conflicts
        # 1. Check if chunks and semantic_chunks have different data
        if "chunks" in tables and "semantic_chunks" in tables:
            chunks_count = results["tables"]["chunks"]["row_count"]
            semantic_count = results["tables"]["semantic_chunks"]["row_count"]

            if chunks_count > 0 and semantic_count > 0:
                # Check for ID overlap
                overlap = conn.execute("""
                    SELECT COUNT(*) FROM chunks c
                    JOIN semantic_chunks sc ON c.id = sc.id
                """).fetchone()[0]

                results["conflicts"].append({
                    "type": "table_overlap",
                    "tables": ["chunks", "semantic_chunks"],
                    "chunks_count": chunks_count,
                    "semantic_count": semantic_count,
                    "id_overlap": overlap,
                    "description": (
                        f"Both chunks ({chunks_count}) and semantic_chunks ({semantic_count}) "
                        f"have data. ID overlap: {overlap}. "
                        "This can cause confusion - binary_cascade reads from semantic_chunks "
                        "but SQLiteStore reads from chunks."
                    )
                })
            elif chunks_count == 0 and semantic_count > 0:
                results["recommendations"].append(
                    "chunks table is empty but semantic_chunks has data. "
                    "Use cascade-index (semantic_chunks) for better semantic search."
                )
            elif chunks_count > 0 and semantic_count == 0:
                results["recommendations"].append(
                    "semantic_chunks is empty. Run 'codexlens cascade-index' to enable "
                    "binary cascade search."
                )

        # 2. Check SPLADE index status
        if "splade_posting_list" in tables:
            splade_count = results["tables"]["splade_posting_list"]["row_count"]
            if splade_count == 0:
                results["recommendations"].append(
                    "SPLADE tables exist but are empty. Run SPLADE indexing to enable sparse retrieval."
                )

        # 3. Check FTS tables
        fts_tables = [t for t in tables if t.startswith("files_fts")]
        if len(fts_tables) >= 2:
            results["recommendations"].append(
                f"Found {len(fts_tables)} FTS tables: {fts_tables}. "
                "Dual FTS (exact + fuzzy) is properly configured."
            )

    return results


def analyze_method_contributions(
    index_path: Path,
    queries: List[str],
    limit: int = 20
) -> Dict[str, Any]:
    """Analyze contribution of each retrieval method.

    Runs each method independently and measures:
    - Result count
    - Latency
    - Score distribution
    - Overlap with other methods
    """
    results = {
        "per_query": [],
        "summary": {}
    }

    for query in queries:
        query_result = {
            "query": query,
            "methods": {},
            "fusion_analysis": {}
        }

        # Run each method independently
        methods = {
            "fts_exact": {"fuzzy": False, "vector": False, "splade": False},
            "fts_fuzzy": {"fuzzy": True, "vector": False, "splade": False},
            "vector": {"fuzzy": False, "vector": True, "splade": False},
            "splade": {"fuzzy": False, "vector": False, "splade": True},
        }

        method_results: Dict[str, List[SearchResult]] = {}

        for method_name, config in methods.items():
            try:
                engine = HybridSearchEngine()

                # Set config to disable/enable specific backends
                engine._config = type('obj', (object,), {
                    'use_fts_fallback': method_name.startswith("fts"),
                    'enable_splade': method_name == "splade",
                    'embedding_use_gpu': True,
                })()

                start = time.perf_counter()

                if method_name == "fts_exact":
                    # Force FTS fallback mode with fuzzy disabled
                    engine.weights = FTS_FALLBACK_WEIGHTS.copy()
                    results_list = engine.search(
                        index_path, query, limit=limit,
                        enable_fuzzy=False, enable_vector=False, pure_vector=False
                    )
                elif method_name == "fts_fuzzy":
                    engine.weights = FTS_FALLBACK_WEIGHTS.copy()
                    results_list = engine.search(
                        index_path, query, limit=limit,
                        enable_fuzzy=True, enable_vector=False, pure_vector=False
                    )
                elif method_name == "vector":
                    results_list = engine.search(
                        index_path, query, limit=limit,
                        enable_fuzzy=False, enable_vector=True, pure_vector=True
                    )
                elif method_name == "splade":
                    engine.weights = {"splade": 1.0}
                    results_list = engine.search(
                        index_path, query, limit=limit,
                        enable_fuzzy=False, enable_vector=False, pure_vector=False
                    )
                else:
                    results_list = []

                latency = (time.perf_counter() - start) * 1000

                method_results[method_name] = results_list

                scores = [r.score for r in results_list]
                query_result["methods"][method_name] = {
                    "count": len(results_list),
                    "latency_ms": latency,
                    "avg_score": sum(scores) / len(scores) if scores else 0,
                    "max_score": max(scores) if scores else 0,
                    "min_score": min(scores) if scores else 0,
                    "top_3_files": [r.path.split("\\")[-1] for r in results_list[:3]]
                }

            except Exception as e:
                query_result["methods"][method_name] = {
                    "error": str(e),
                    "count": 0
                }

        # Compute overlap between methods
        method_paths = {
            name: set(r.path for r in res)
            for name, res in method_results.items()
            if res
        }

        overlaps = {}
        method_names = list(method_paths.keys())
        for i, m1 in enumerate(method_names):
            for m2 in method_names[i+1:]:
                overlap = len(method_paths[m1] & method_paths[m2])
                union = len(method_paths[m1] | method_paths[m2])
                jaccard = overlap / union if union > 0 else 0
                overlaps[f"{m1}_vs_{m2}"] = {
                    "overlap_count": overlap,
                    "jaccard": jaccard,
                    f"{m1}_unique": len(method_paths[m1] - method_paths[m2]),
                    f"{m2}_unique": len(method_paths[m2] - method_paths[m1]),
                }

        query_result["overlaps"] = overlaps

        # Analyze RRF fusion contribution
        if len(method_results) >= 2:
            # Compute RRF with each method's contribution
            rrf_map = {}
            # Use a loop variable distinct from the outer `results` dict, which a
            # bare `for name, results in ...` here would otherwise rebind
            for name, res_list in method_results.items():
                if res_list and name in ["fts_exact", "splade", "vector"]:
                    # Rename for RRF
                    rrf_name = name.replace("fts_exact", "exact")
                    rrf_map[rrf_name] = res_list

            if rrf_map:
                fused = reciprocal_rank_fusion(rrf_map, k=60)

                # Analyze which methods contributed to top results
                source_contributions = defaultdict(int)
                for r in fused[:10]:
                    source_ranks = r.metadata.get("source_ranks", {})
                    for source in source_ranks:
                        source_contributions[source] += 1

                query_result["fusion_analysis"] = {
                    "total_fused": len(fused),
                    "top_10_source_distribution": dict(source_contributions)
                }

        results["per_query"].append(query_result)

    # Compute summary statistics
    method_stats = defaultdict(lambda: {"counts": [], "latencies": []})
    for qr in results["per_query"]:
        for method, data in qr["methods"].items():
            if "count" in data:
                method_stats[method]["counts"].append(data["count"])
            if "latency_ms" in data:
                method_stats[method]["latencies"].append(data["latency_ms"])

    results["summary"] = {
        method: {
            "avg_count": sum(s["counts"]) / len(s["counts"]) if s["counts"] else 0,
            "avg_latency_ms": sum(s["latencies"]) / len(s["latencies"]) if s["latencies"] else 0,
        }
        for method, s in method_stats.items()
    }

    return results


def experiment_fts_rerank_fusion(
    index_path: Path,
    queries: List[str],
    limit: int = 10,
    coarse_k: int = 50
) -> Dict[str, Any]:
    """Experiment: FTS + Rerank fusion vs standard hybrid.

    Compares:
    1. Standard Hybrid (SPLADE + Vector RRF)
    2. FTS + CrossEncoder Rerank -> then fuse with Vector
    """
    results = {
        "per_query": [],
        "summary": {}
    }

    # Initialize reranker
    try:
        from codexlens.semantic.reranker import get_reranker, check_reranker_available
        ok, _ = check_reranker_available("onnx")
        if ok:
            reranker = get_reranker(backend="onnx", use_gpu=True)
        else:
            reranker = None
    except Exception as e:
        print(f"Reranker unavailable: {e}")
        reranker = None

    for query in queries:
        query_result = {
            "query": query,
            "strategies": {}
        }

        # Strategy 1: Standard Hybrid (SPLADE + Vector)
        try:
            engine = HybridSearchEngine(weights=DEFAULT_WEIGHTS)
            engine._config = type('obj', (object,), {
                'enable_splade': True,
                'use_fts_fallback': False,
                'embedding_use_gpu': True,
            })()

            start = time.perf_counter()
            standard_results = engine.search(
                index_path, query, limit=limit,
                enable_vector=True
            )
            standard_latency = (time.perf_counter() - start) * 1000

            query_result["strategies"]["standard_hybrid"] = {
                "count": len(standard_results),
                "latency_ms": standard_latency,
                "top_5": [r.path.split("\\")[-1] for r in standard_results[:5]],
                "scores": [r.score for r in standard_results[:5]]
            }
        except Exception as e:
            query_result["strategies"]["standard_hybrid"] = {"error": str(e)}

        # Strategy 2: FTS + Rerank -> Fuse with Vector
        try:
            # Step 1: Get FTS results (coarse)
            fts_engine = HybridSearchEngine(weights=FTS_FALLBACK_WEIGHTS)
            fts_engine._config = type('obj', (object,), {
                'use_fts_fallback': True,
                'enable_splade': False,
                'embedding_use_gpu': True,
            })()

            start = time.perf_counter()
            fts_results = fts_engine.search(
                index_path, query, limit=coarse_k,
                enable_fuzzy=True, enable_vector=False
            )
            fts_latency = (time.perf_counter() - start) * 1000

            # Step 2: Rerank FTS results with CrossEncoder
            if reranker and fts_results:
                rerank_start = time.perf_counter()
                reranked_fts = cross_encoder_rerank(
                    query, fts_results, reranker, top_k=20
                )
                rerank_latency = (time.perf_counter() - rerank_start) * 1000
            else:
                reranked_fts = fts_results[:20]
                rerank_latency = 0

            # Step 3: Get Vector results
            vector_engine = HybridSearchEngine()
            vector_results = vector_engine.search(
                index_path, query, limit=20,
                enable_vector=True, pure_vector=True
            )

            # Step 4: Fuse reranked FTS with Vector
            if reranked_fts and vector_results:
                fusion_map = {
                    "fts_reranked": reranked_fts,
                    "vector": vector_results
                }
                fused_results = reciprocal_rank_fusion(
                    fusion_map,
                    weights={"fts_reranked": 0.5, "vector": 0.5},
                    k=60
                )
            else:
                fused_results = reranked_fts or vector_results or []

            total_latency = fts_latency + rerank_latency + (time.perf_counter() - start) * 1000

            query_result["strategies"]["fts_rerank_fusion"] = {
                "count": len(fused_results),
                "total_latency_ms": fts_latency + rerank_latency,
                "fts_latency_ms": fts_latency,
                "rerank_latency_ms": rerank_latency,
                "top_5": [r.path.split("\\")[-1] for r in fused_results[:5]],
                "scores": [r.score for r in fused_results[:5]]
            }
        except Exception as e:
            query_result["strategies"]["fts_rerank_fusion"] = {"error": str(e)}

        # Compute overlap between strategies
        if (
            "error" not in query_result["strategies"].get("standard_hybrid", {})
            and "error" not in query_result["strategies"].get("fts_rerank_fusion", {})
        ):
            standard_paths = set(r.path.split("\\")[-1] for r in standard_results[:10])
            fts_rerank_paths = set(r.path.split("\\")[-1] for r in fused_results[:10])

            overlap = len(standard_paths & fts_rerank_paths)
            query_result["comparison"] = {
                "top_10_overlap": overlap,
                "standard_unique": list(standard_paths - fts_rerank_paths)[:3],
                "fts_rerank_unique": list(fts_rerank_paths - standard_paths)[:3]
            }

        results["per_query"].append(query_result)

    return results


def main():
    """Run all analyses."""
    source_path = Path("D:/Claude_dms3/codex-lens/src")
    index_path = find_project_index(source_path)

    print(f"Using index: {index_path}")
    print(f"Index exists: {index_path.exists()}")
    print()

    # Test queries
    queries = [
        "binary quantization",
        "hamming distance search",
        "embeddings generation",
        "reranking algorithm",
        "database connection handling",
    ]

    # 1. Storage Architecture Analysis
    print("=" * 60)
    print("1. STORAGE ARCHITECTURE ANALYSIS")
    print("=" * 60)

    storage_analysis = analyze_storage_architecture(index_path)

    print("\nTable Overview:")
    for table, info in sorted(storage_analysis["tables"].items()):
        if "row_count" in info:
            print(f"  {table}: {info['row_count']} rows")

    print("\nConflicts Detected:")
    for conflict in storage_analysis["conflicts"]:
        print(f"  - {conflict['description']}")

    print("\nRecommendations:")
    for rec in storage_analysis["recommendations"]:
        print(f"  - {rec}")

    # 2. Method Contribution Analysis
    print("\n" + "=" * 60)
    print("2. METHOD CONTRIBUTION ANALYSIS")
    print("=" * 60)

    contribution_analysis = analyze_method_contributions(index_path, queries)

    print("\nPer-Query Results:")
    for qr in contribution_analysis["per_query"]:
        print(f"\n  Query: '{qr['query']}'")
        for method, data in qr["methods"].items():
            if "error" not in data:
                print(f"    {method}: {data['count']} results, {data['latency_ms']:.1f}ms")
                if data.get("top_3_files"):
                    print(f"      Top 3: {', '.join(data['top_3_files'])}")

        if qr.get("overlaps"):
            print("    Overlaps:")
            for pair, info in qr["overlaps"].items():
                print(f"      {pair}: {info['overlap_count']} common (Jaccard: {info['jaccard']:.2f})")

    print("\nSummary:")
    for method, stats in contribution_analysis["summary"].items():
        print(f"  {method}: avg {stats['avg_count']:.1f} results, {stats['avg_latency_ms']:.1f}ms")

    # 3. FTS + Rerank Fusion Experiment
    print("\n" + "=" * 60)
    print("3. FTS + RERANK FUSION EXPERIMENT")
    print("=" * 60)

    fusion_experiment = experiment_fts_rerank_fusion(index_path, queries)

    print("\nPer-Query Comparison:")
    for qr in fusion_experiment["per_query"]:
        print(f"\n  Query: '{qr['query']}'")
        for strategy, data in qr["strategies"].items():
            if "error" not in data:
                latency = data.get("total_latency_ms") or data.get("latency_ms", 0)
                print(f"    {strategy}: {data['count']} results, {latency:.1f}ms")
                if data.get("top_5"):
                    print(f"      Top 5: {', '.join(data['top_5'][:3])}...")

        if qr.get("comparison"):
            comp = qr["comparison"]
            print(f"    Top-10 Overlap: {comp['top_10_overlap']}/10")

    # Save full results
    output_path = Path(__file__).parent / "results" / "method_contribution_analysis.json"
    output_path.parent.mkdir(exist_ok=True)

    full_results = {
        "storage_analysis": storage_analysis,
        "contribution_analysis": contribution_analysis,
        "fusion_experiment": fusion_experiment
    }

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(full_results, f, indent=2, default=str)

    print(f"\n\nFull results saved to: {output_path}")


if __name__ == "__main__":
    main()
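Both scripts stub the engine configuration with `type('obj', (object,), {...})()`, a throwaway class whose instance mimics the real config object. The results JSON below records the failure mode of that pattern: every query in this run errored with "'obj' object has no attribute 'symbol_boost_factor'" because the stubs in this script omit an attribute the engine reads (the stubs in `analyze_methods.py` above add it). A sketch of the same pattern using `types.SimpleNamespace`, the more idiomatic way to build such a stub; the attribute names mirror the config keys used in this commit:

from types import SimpleNamespace

# Equivalent to type("obj", (object,), {...})() but clearer and easier to audit.
config = SimpleNamespace(
    use_fts_fallback=True,
    enable_splade=False,
    embedding_use_gpu=True,
    symbol_boost_factor=1.5,  # the attribute whose absence produced the errors below
    enable_reranking=False,
)
assert config.symbol_boost_factor == 1.5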
codex-lens/benchmarks/results/method_contribution_analysis.json (new file, 406 lines)
@@ -0,0 +1,406 @@
{
  "storage_analysis": {
    "tables": {
      "code_relationships": {"row_count": 0, "columns": ["id", "source_symbol_id", "target_qualified_name", "relationship_type", "source_line", "target_file"]},
      "embeddings_config": {"row_count": 1, "columns": ["id", "model_profile", "model_name", "embedding_dim", "backend", "created_at", "updated_at"]},
      "file_keywords": {"row_count": 0, "columns": ["file_id", "keyword_id"]},
      "files": {"row_count": 0, "columns": ["id", "name", "full_path", "language", "content", "mtime", "line_count"]},
      "files_fts_exact": {"row_count": 0, "columns": ["name", "full_path", "content"]},
      "files_fts_exact_config": {"row_count": 1, "columns": ["k", "v"]},
      "files_fts_exact_data": {"row_count": 2, "columns": ["id", "block"]},
      "files_fts_exact_docsize": {"row_count": 0, "columns": ["id", "sz"]},
      "files_fts_exact_idx": {"row_count": 0, "columns": ["segid", "term", "pgno"]},
      "files_fts_fuzzy": {"row_count": 0, "columns": ["name", "full_path", "content"]},
      "files_fts_fuzzy_config": {"row_count": 1, "columns": ["k", "v"]},
      "files_fts_fuzzy_data": {"row_count": 2, "columns": ["id", "block"]},
      "files_fts_fuzzy_docsize": {"row_count": 0, "columns": ["id", "sz"]},
      "files_fts_fuzzy_idx": {"row_count": 0, "columns": ["segid", "term", "pgno"]},
      "graph_neighbors": {"row_count": 0, "columns": ["source_symbol_id", "neighbor_symbol_id", "relationship_depth"]},
      "keywords": {"row_count": 0, "columns": ["id", "keyword"]},
      "merkle_hashes": {"row_count": 0, "columns": ["file_id", "sha256", "updated_at"]},
      "merkle_state": {"row_count": 1, "columns": ["id", "root_hash", "updated_at"]},
      "semantic_chunks": {"row_count": 0, "columns": ["id", "file_path", "content", "embedding", "metadata", "created_at", "embedding_binary", "embedding_dense"]},
      "semantic_metadata": {"row_count": 0, "columns": ["id", "file_id", "summary", "purpose", "llm_tool", "generated_at"]},
      "sqlite_sequence": {"row_count": 0, "columns": ["name", "seq"]},
      "subdirs": {"row_count": 2, "columns": ["id", "name", "index_path", "files_count", "last_updated"]},
      "symbols": {"row_count": 0, "columns": ["id", "file_id", "name", "kind", "start_line", "end_line"]}
    },
    "conflicts": [],
    "recommendations": [
      "Found 10 FTS tables: ['files_fts_exact', 'files_fts_exact_config', 'files_fts_exact_data', 'files_fts_exact_docsize', 'files_fts_exact_idx', 'files_fts_fuzzy', 'files_fts_fuzzy_config', 'files_fts_fuzzy_data', 'files_fts_fuzzy_docsize', 'files_fts_fuzzy_idx']. Dual FTS (exact + fuzzy) is properly configured."
    ]
  },
  "contribution_analysis": {
    "per_query": [
      {
        "query": "binary quantization",
        "methods": {
          "fts_exact": {"error": "'obj' object has no attribute 'symbol_boost_factor'", "count": 0},
          "fts_fuzzy": {"error": "'obj' object has no attribute 'symbol_boost_factor'", "count": 0},
          "vector": {"error": "'obj' object has no attribute 'symbol_boost_factor'", "count": 0},
          "splade": {"error": "'obj' object has no attribute 'symbol_boost_factor'", "count": 0}
        },
        "fusion_analysis": {},
        "overlaps": {}
      },
      {
        "query": "hamming distance search",
        "methods": {
          "fts_exact": {"error": "'obj' object has no attribute 'symbol_boost_factor'", "count": 0},
          "fts_fuzzy": {"error": "'obj' object has no attribute 'symbol_boost_factor'", "count": 0},
          "vector": {"error": "'obj' object has no attribute 'symbol_boost_factor'", "count": 0},
          "splade": {"error": "'obj' object has no attribute 'symbol_boost_factor'", "count": 0}
        },
        "fusion_analysis": {},
        "overlaps": {}
      },
      {
        "query": "embeddings generation",
        "methods": {
          "fts_exact": {"error": "'obj' object has no attribute 'symbol_boost_factor'", "count": 0},
          "fts_fuzzy": {"error": "'obj' object has no attribute 'symbol_boost_factor'", "count": 0},
          "vector": {"error": "'obj' object has no attribute 'symbol_boost_factor'", "count": 0},
          "splade": {"error": "'obj' object has no attribute 'symbol_boost_factor'", "count": 0}
        },
        "fusion_analysis": {},
        "overlaps": {}
      },
      {
        "query": "reranking algorithm",
        "methods": {
          "fts_exact": {"error": "'obj' object has no attribute 'symbol_boost_factor'", "count": 0},
          "fts_fuzzy": {"error": "'obj' object has no attribute 'symbol_boost_factor'", "count": 0},
          "vector": {"error": "'obj' object has no attribute 'symbol_boost_factor'", "count": 0},
          "splade": {"error": "'obj' object has no attribute 'symbol_boost_factor'", "count": 0}
        },
        "fusion_analysis": {},
        "overlaps": {}
      },
      {
        "query": "database connection handling",
        "methods": {
          "fts_exact": {"error": "'obj' object has no attribute 'symbol_boost_factor'", "count": 0},
          "fts_fuzzy": {"error": "'obj' object has no attribute 'symbol_boost_factor'", "count": 0},
          "vector": {"error": "'obj' object has no attribute 'symbol_boost_factor'", "count": 0},
          "splade": {"error": "'obj' object has no attribute 'symbol_boost_factor'", "count": 0}
        },
        "fusion_analysis": {},
        "overlaps": {}
      }
    ],
    "summary": {
      "fts_exact": {"avg_count": 0.0, "avg_latency_ms": 0},
      "fts_fuzzy": {"avg_count": 0.0, "avg_latency_ms": 0},
      "vector": {"avg_count": 0.0, "avg_latency_ms": 0},
      "splade": {"avg_count": 0.0, "avg_latency_ms": 0}
    }
  },
  "fusion_experiment": {
    "per_query": [
      {
        "query": "binary quantization",
        "strategies": {
          "standard_hybrid": {"error": "'obj' object has no attribute 'symbol_boost_factor'"},
          "fts_rerank_fusion": {"error": "'obj' object has no attribute 'symbol_boost_factor'"}
        }
      },
      {
        "query": "hamming distance search",
        "strategies": {
          "standard_hybrid": {"error": "'obj' object has no attribute 'symbol_boost_factor'"},
          "fts_rerank_fusion": {"error": "'obj' object has no attribute 'symbol_boost_factor'"}
        }
      },
      {
        "query": "embeddings generation",
        "strategies": {
          "standard_hybrid": {"error": "'obj' object has no attribute 'symbol_boost_factor'"},
          "fts_rerank_fusion": {"error": "'obj' object has no attribute 'symbol_boost_factor'"}
        }
      },
      {
        "query": "reranking algorithm",
        "strategies": {
          "standard_hybrid": {"error": "'obj' object has no attribute 'symbol_boost_factor'"},
          "fts_rerank_fusion": {"error": "'obj' object has no attribute 'symbol_boost_factor'"}
        }
      },
      {
        "query": "database connection handling",
        "strategies": {
          "standard_hybrid": {"error": "'obj' object has no attribute 'symbol_boost_factor'"},
          "fts_rerank_fusion": {"error": "'obj' object has no attribute 'symbol_boost_factor'"}
        }
      }
    ],
    "summary": {}
  }
}
@@ -1033,6 +1033,28 @@ class VectorStore:
            row = conn.execute("SELECT COUNT(*) FROM semantic_chunks").fetchone()
            return row[0] if row else 0

    def get_all_chunks(self) -> List[SemanticChunk]:
        """Get all chunks from the store.

        Returns:
            List of SemanticChunk objects with id and content.
        """
        with sqlite3.connect(self.db_path) as conn:
            conn.row_factory = sqlite3.Row
            rows = conn.execute(
                "SELECT id, file_path, content, metadata FROM semantic_chunks"
            ).fetchall()

            chunks = []
            for row in rows:
                chunks.append(SemanticChunk(
                    id=row["id"],
                    content=row["content"],
                    file_path=row["file_path"],
                    metadata=json.loads(row["metadata"]) if row["metadata"] else None,
                ))
            return chunks

    def clear_cache(self) -> None:
        """Manually clear the embedding cache."""
        self._invalidate_cache()