mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-02-11 02:33:51 +08:00
refactor: remove SPLADE and hybrid_cascade, streamline the search architecture

Remove the SPLADE sparse neural search backend and the hybrid_cascade strategy,
simplifying the search architecture from six backends to four (FTS Exact/Fuzzy,
Binary Vector, Dense Vector, LSP).

Main changes:
- Delete 4 files, including splade_encoder.py, splade_index.py, and migration_009
- Remove the SPLADE-related settings from config.py (enable_splade, splade_model, etc.)
- Change DEFAULT_WEIGHTS to the FTS weights {exact:0.25, fuzzy:0.1, vector:0.5, lsp:0.15} (see the sketch after this list)
- Delete hybrid_cascade_search(); all cascade fallbacks now call self.search()
- Map the API's fusion_strategy='hybrid' to binary_rerank for backward compatibility
- Delete the CLI index_splade/splade_status commands and --method splade
- Update tests, benchmarks, and documentation
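For illustration only, below is a minimal weighted Reciprocal Rank Fusion sketch using the new four-backend weights from this commit. The per-backend result shape (plain lists of file paths) and the helper name weighted_rrf are assumptions made for this example, not codexlens's actual reciprocal_rank_fusion API.

    # Minimal weighted-RRF sketch (illustrative; not the codexlens implementation).
    from collections import defaultdict
    from typing import Dict, List

    DEFAULT_WEIGHTS = {"exact": 0.25, "fuzzy": 0.1, "vector": 0.5, "lsp": 0.15}

    def weighted_rrf(ranked: Dict[str, List[str]], weights: Dict[str, float], k: int = 60) -> List[str]:
        """Fuse per-backend rankings: score(doc) = sum over backends of weight / (k + rank)."""
        scores: Dict[str, float] = defaultdict(float)
        for backend, docs in ranked.items():
            w = weights.get(backend, 0.0)
            for rank, doc in enumerate(docs, start=1):
                scores[doc] += w / (k + rank)
        return sorted(scores, key=scores.get, reverse=True)

    # Example: fuse hypothetical results from the four remaining backends.
    fused = weighted_rrf(
        {
            "exact": ["a.py", "b.py"],
            "fuzzy": ["b.py", "c.py"],
            "vector": ["c.py", "a.py"],
            "lsp": ["a.py"],
        },
        DEFAULT_WEIGHTS,
    )
    print(fused)  # ['a.py', 'c.py', 'b.py']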
@@ -12,7 +12,6 @@ from codexlens.search.ranking import (
     reciprocal_rank_fusion,
     cross_encoder_rerank,
     DEFAULT_WEIGHTS,
-    FTS_FALLBACK_WEIGHTS,
 )

 # Use index with most data
@@ -65,12 +64,6 @@ with sqlite3.connect(index_path) as conn:
         non_null = semantic_count - null_count
         print(f" {col}: {non_null}/{semantic_count} non-null")

-    if "splade_posting_list" in tables:
-        splade_count = conn.execute("SELECT COUNT(*) FROM splade_posting_list").fetchone()[0]
-        print(f"\n splade_posting_list: {splade_count} postings")
-    else:
-        print("\n splade_posting_list: NOT EXISTS")
-
 print("\n" + "=" * 60)
 print("2. METHOD CONTRIBUTION ANALYSIS")
 print("=" * 60)
@@ -87,7 +80,6 @@ results_summary = {
     "fts_exact": [],
     "fts_fuzzy": [],
     "vector": [],
-    "splade": [],
 }

 for query in queries:
@@ -95,10 +87,9 @@ for query in queries:

     # FTS Exact
     try:
-        engine = HybridSearchEngine(weights=FTS_FALLBACK_WEIGHTS)
+        engine = HybridSearchEngine(weights=DEFAULT_WEIGHTS)
         engine._config = type("obj", (object,), {
             "use_fts_fallback": True,
-            "enable_splade": False,
             "embedding_use_gpu": True,
             "symbol_boost_factor": 1.5,
             "enable_reranking": False,
@@ -117,10 +108,9 @@ for query in queries:

     # FTS Fuzzy
     try:
-        engine = HybridSearchEngine(weights=FTS_FALLBACK_WEIGHTS)
+        engine = HybridSearchEngine(weights=DEFAULT_WEIGHTS)
         engine._config = type("obj", (object,), {
             "use_fts_fallback": True,
-            "enable_splade": False,
             "embedding_use_gpu": True,
             "symbol_boost_factor": 1.5,
             "enable_reranking": False,
@@ -142,7 +132,6 @@ for query in queries:
         engine = HybridSearchEngine()
         engine._config = type("obj", (object,), {
             "use_fts_fallback": False,
-            "enable_splade": False,
             "embedding_use_gpu": True,
             "symbol_boost_factor": 1.5,
             "enable_reranking": False,
@@ -159,28 +148,6 @@ for query in queries:
     except Exception as e:
         print(f" Vector: ERROR - {e}")

-    # SPLADE
-    try:
-        engine = HybridSearchEngine(weights={"splade": 1.0})
-        engine._config = type("obj", (object,), {
-            "use_fts_fallback": False,
-            "enable_splade": True,
-            "embedding_use_gpu": True,
-            "symbol_boost_factor": 1.5,
-            "enable_reranking": False,
-        })()
-
-        start = time.perf_counter()
-        results = engine.search(index_path, query, limit=10, enable_fuzzy=False, enable_vector=False)
-        latency = (time.perf_counter() - start) * 1000
-
-        results_summary["splade"].append({"count": len(results), "latency": latency})
-        top_file = results[0].path.split("\\")[-1] if results else "N/A"
-        top_score = results[0].score if results else 0
-        print(f" SPLADE: {len(results)} results, {latency:.1f}ms, top: {top_file} ({top_score:.3f})")
-    except Exception as e:
-        print(f" SPLADE: ERROR - {e}")
-
 print("\n--- Summary ---")
 for method, data in results_summary.items():
     if data:
@@ -210,10 +177,9 @@ for query in test_queries:

     # Strategy 1: Standard Hybrid (FTS exact+fuzzy RRF)
     try:
-        engine = HybridSearchEngine(weights=FTS_FALLBACK_WEIGHTS)
+        engine = HybridSearchEngine(weights=DEFAULT_WEIGHTS)
         engine._config = type("obj", (object,), {
             "use_fts_fallback": True,
-            "enable_splade": False,
             "embedding_use_gpu": True,
             "symbol_boost_factor": 1.5,
             "enable_reranking": False,
@@ -263,7 +229,6 @@ print("""
 1. Storage Architecture:
    - semantic_chunks: Used by cascade-index (binary+dense vectors)
    - chunks: Used by legacy SQLiteStore (currently empty in this index)
-   - splade_posting_list: Used by SPLADE sparse retrieval
    - files_fts_*: Used by FTS exact/fuzzy search

 CONFLICT: binary_cascade_search reads from semantic_chunks,
@@ -272,7 +237,6 @@ print("""
 2. Method Contributions:
    - FTS: Fast but limited to keyword matching
    - Vector: Semantic understanding but requires embeddings
-   - SPLADE: Sparse retrieval, good for keyword+semantic hybrid

 3. FTS + Rerank Fusion:
    - CrossEncoder reranking can improve precision

@@ -3,7 +3,7 @@

 Compares:
 - binary: 256-dim binary coarse ranking + 2048-dim dense fine ranking
-- hybrid: FTS+SPLADE+Vector coarse ranking + CrossEncoder fine ranking
+- hybrid: FTS+Vector coarse ranking + CrossEncoder fine ranking

 Usage:
     python benchmarks/cascade_benchmark.py [--source PATH] [--queries N] [--warmup N]

@@ -1,9 +1,8 @@
-"""Compare Binary Cascade, SPLADE, and Vector semantic search methods.
+"""Compare Binary Cascade and Vector semantic search methods.

-This script compares the three semantic retrieval approaches:
+This script compares the two semantic retrieval approaches:
 1. Binary Cascade: 256-bit binary vectors for coarse ranking
-2. SPLADE: Sparse learned representations with inverted index
-3. Vector Dense: Full semantic embeddings with cosine similarity
+2. Vector Dense: Full semantic embeddings with cosine similarity
 """

 import sys
@@ -14,7 +13,6 @@ from pathlib import Path
 sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

 from codexlens.storage.dir_index import DirIndexStore
-from codexlens.storage.splade_index import SpladeIndex
 from codexlens.semantic.vector_store import VectorStore


@@ -27,19 +25,6 @@ def get_filename(path: str) -> str:
     return path


-def find_splade_db(index_root: Path) -> Path:
-    """Find SPLADE database by searching directory tree."""
-    # Check root first
-    if (index_root / "_splade.db").exists():
-        return index_root / "_splade.db"
-
-    # Search in subdirectories
-    for splade_db in index_root.rglob("_splade.db"):
-        return splade_db
-
-    return None
-
-
 def find_binary_indexes(index_root: Path):
     """Find all binary index files."""
     return list(index_root.rglob("_index_binary_vectors.bin"))
@@ -108,55 +93,6 @@ def test_vector_search(query: str, limit: int = 10):
         return [], 0, str(e)


-def test_splade_search(query: str, limit: int = 10):
-    """Test SPLADE sparse search."""
-    try:
-        from codexlens.semantic.splade_encoder import get_splade_encoder, check_splade_available
-
-        ok, err = check_splade_available()
-        if not ok:
-            return [], 0, f"SPLADE not available: {err}"
-
-        splade_db_path = find_splade_db(INDEX_ROOT)
-        if not splade_db_path:
-            return [], 0, "SPLADE database not found"
-
-        splade_index = SpladeIndex(splade_db_path)
-        if not splade_index.has_index():
-            return [], 0, "SPLADE index not initialized"
-
-        start = time.perf_counter()
-        encoder = get_splade_encoder()
-        query_sparse = encoder.encode_text(query)
-        raw_results = splade_index.search(query_sparse, limit=limit, min_score=0.0)
-
-        if not raw_results:
-            elapsed = (time.perf_counter() - start) * 1000
-            return [], elapsed, None
-
-        # Get chunk details
-        chunk_ids = [chunk_id for chunk_id, _ in raw_results]
-        score_map = {chunk_id: score for chunk_id, score in raw_results}
-        rows = splade_index.get_chunks_by_ids(chunk_ids)
-
-        elapsed = (time.perf_counter() - start) * 1000
-
-        # Build result objects
-        results = []
-        for row in rows:
-            chunk_id = row["id"]
-            results.append({
-                "path": row["file_path"],
-                "score": score_map.get(chunk_id, 0.0),
-                "content": row["content"][:200] + "..." if len(row["content"]) > 200 else row["content"],
-            })
-
-        # Sort by score
-        results.sort(key=lambda x: x["score"], reverse=True)
-        return results, elapsed, None
-    except Exception as e:
-        return [], 0, str(e)
-

 def test_binary_cascade_search(query: str, limit: int = 10):
     """Test binary cascade search (binary coarse + dense fine ranking)."""
@@ -336,16 +272,13 @@ def compare_overlap(results1, results2, name1: str, name2: str):
 def main():
     print("=" * 70)
     print("SEMANTIC SEARCH METHODS COMPARISON")
-    print("Binary Cascade vs SPLADE vs Vector Dense")
+    print("Binary Cascade vs Vector Dense")
     print("=" * 70)

     # Check prerequisites
     print("\n[Prerequisites Check]")
     print(f" Index Root: {INDEX_ROOT}")

-    splade_db = find_splade_db(INDEX_ROOT)
-    print(f" SPLADE DB: {splade_db} - {'EXISTS' if splade_db else 'NOT FOUND'}")
-
     binary_indexes = find_binary_indexes(INDEX_ROOT)
     print(f" Binary Indexes: {len(binary_indexes)} found")
     for bi in binary_indexes[:3]:
@@ -356,11 +289,10 @@ def main():
     # Aggregate statistics
     all_results = {
         "binary": {"total_results": 0, "total_time": 0, "queries": 0, "errors": []},
-        "splade": {"total_results": 0, "total_time": 0, "queries": 0, "errors": []},
         "vector": {"total_results": 0, "total_time": 0, "queries": 0, "errors": []},
     }

-    overlap_scores = {"binary_splade": [], "binary_vector": [], "splade_vector": []}
+    overlap_scores = {"binary_vector": []}

     for query in TEST_QUERIES:
         print(f"\n{'#'*70}")
@@ -369,12 +301,10 @@ def main():

         # Test each method
         binary_results, binary_time, binary_err = test_binary_cascade_search(query)
-        splade_results, splade_time, splade_err = test_splade_search(query)
         vector_results, vector_time, vector_err = test_vector_search(query)

         # Print results
         print_results("Binary Cascade (256-bit + Dense Rerank)", binary_results, binary_time, binary_err)
-        print_results("SPLADE (Sparse Learned)", splade_results, splade_time, splade_err)
         print_results("Vector Dense (Semantic Embeddings)", vector_results, vector_time, vector_err)

         # Update statistics
@@ -385,13 +315,6 @@ def main():
         else:
             all_results["binary"]["errors"].append(binary_err)

-        if not splade_err:
-            all_results["splade"]["total_results"] += len(splade_results)
-            all_results["splade"]["total_time"] += splade_time
-            all_results["splade"]["queries"] += 1
-        else:
-            all_results["splade"]["errors"].append(splade_err)
-
         if not vector_err:
             all_results["vector"]["total_results"] += len(vector_results)
             all_results["vector"]["total_time"] += vector_time
@@ -401,15 +324,9 @@ def main():

         # Compare overlap
         print("\n[Result Overlap Analysis]")
-        if binary_results and splade_results:
-            j = compare_overlap(binary_results, splade_results, "Binary", "SPLADE")
-            overlap_scores["binary_splade"].append(j)
         if binary_results and vector_results:
             j = compare_overlap(binary_results, vector_results, "Binary", "Vector")
             overlap_scores["binary_vector"].append(j)
-        if splade_results and vector_results:
-            j = compare_overlap(splade_results, vector_results, "SPLADE", "Vector")
-            overlap_scores["splade_vector"].append(j)

     # Print summary
     print("\n" + "=" * 70)
@@ -447,13 +364,13 @@ def main():
     # Analyze working methods
     working_methods = [m for m, s in all_results.items() if s["queries"] > 0]

-    if len(working_methods) == 3:
+    if len(working_methods) == 2:
         # All methods working - compare quality
-        print("\nAll three methods working. Quality comparison:")
+        print("\nBoth methods working. Quality comparison:")

         # Compare avg results
         print("\n Result Coverage (higher = more recall):")
-        for m in ["vector", "splade", "binary"]:
+        for m in ["vector", "binary"]:
             stats = all_results[m]
             if stats["queries"] > 0:
                 avg = stats["total_results"] / stats["queries"]
@@ -461,7 +378,7 @@ def main():

         # Compare speed
         print("\n Speed (lower = faster):")
-        for m in ["binary", "splade", "vector"]:
+        for m in ["binary", "vector"]:
             stats = all_results[m]
             if stats["queries"] > 0:
                 avg = stats["total_time"] / stats["queries"]
@@ -470,11 +387,10 @@ def main():
         # Recommend fusion strategy
         print("\n Recommended Fusion Strategy:")
         print(" For quality-focused hybrid search:")
-        print(" 1. Run all three in parallel")
+        print(" 1. Run both methods in parallel")
         print(" 2. Use RRF fusion with weights:")
-        print(" - Vector: 0.4 (best semantic understanding)")
-        print(" - SPLADE: 0.35 (learned sparse representations)")
-        print(" - Binary: 0.25 (fast coarse filtering)")
+        print(" - Vector: 0.6 (best semantic understanding)")
+        print(" - Binary: 0.4 (fast coarse filtering)")
         print(" 3. Apply CrossEncoder reranking on top-50")

     elif len(working_methods) >= 2:

@@ -1,7 +1,7 @@
 """Analysis script for hybrid search method contribution and storage architecture.

 This script analyzes:
-1. Individual method contribution in hybrid search (FTS/SPLADE/Vector)
+1. Individual method contribution in hybrid search (FTS/Vector)
 2. Storage architecture conflicts between different retrieval methods
 3. FTS + Rerank fusion experiment
 """
@@ -24,9 +24,7 @@ from codexlens.search.ranking import (
     reciprocal_rank_fusion,
     cross_encoder_rerank,
     DEFAULT_WEIGHTS,
-    FTS_FALLBACK_WEIGHTS,
 )
-from codexlens.search.hybrid_search import THREE_WAY_WEIGHTS
 from codexlens.entities import SearchResult


@@ -117,15 +115,7 @@ def analyze_storage_architecture(index_path: Path) -> Dict[str, Any]:
             "binary cascade search."
         )

-    # 2. Check SPLADE index status
-    if "splade_posting_list" in tables:
-        splade_count = results["tables"]["splade_posting_list"]["row_count"]
-        if splade_count == 0:
-            results["recommendations"].append(
-                "SPLADE tables exist but empty. Run SPLADE indexing to enable sparse retrieval."
-            )
-
-    # 3. Check FTS tables
+    # 2. Check FTS tables
     fts_tables = [t for t in tables if t.startswith("files_fts")]
     if len(fts_tables) >= 2:
         results["recommendations"].append(
@@ -163,10 +153,9 @@ def analyze_method_contributions(

     # Run each method independently
     methods = {
-        "fts_exact": {"fuzzy": False, "vector": False, "splade": False},
-        "fts_fuzzy": {"fuzzy": True, "vector": False, "splade": False},
-        "vector": {"fuzzy": False, "vector": True, "splade": False},
-        "splade": {"fuzzy": False, "vector": False, "splade": True},
+        "fts_exact": {"fuzzy": False, "vector": False},
+        "fts_fuzzy": {"fuzzy": True, "vector": False},
+        "vector": {"fuzzy": False, "vector": True},
     }

     method_results: Dict[str, List[SearchResult]] = {}
@@ -178,7 +167,6 @@ def analyze_method_contributions(
         # Set config to disable/enable specific backends
         engine._config = type('obj', (object,), {
             'use_fts_fallback': method_name.startswith("fts"),
-            'enable_splade': method_name == "splade",
             'embedding_use_gpu': True,
         })()

@@ -186,13 +174,13 @@ def analyze_method_contributions(

         if method_name == "fts_exact":
             # Force FTS fallback mode with fuzzy disabled
-            engine.weights = FTS_FALLBACK_WEIGHTS.copy()
+            engine.weights = DEFAULT_WEIGHTS.copy()
             results_list = engine.search(
                 index_path, query, limit=limit,
                 enable_fuzzy=False, enable_vector=False, pure_vector=False
             )
         elif method_name == "fts_fuzzy":
-            engine.weights = FTS_FALLBACK_WEIGHTS.copy()
+            engine.weights = DEFAULT_WEIGHTS.copy()
             results_list = engine.search(
                 index_path, query, limit=limit,
                 enable_fuzzy=True, enable_vector=False, pure_vector=False
@@ -202,12 +190,6 @@ def analyze_method_contributions(
                 index_path, query, limit=limit,
                 enable_fuzzy=False, enable_vector=True, pure_vector=True
             )
-        elif method_name == "splade":
-            engine.weights = {"splade": 1.0}
-            results_list = engine.search(
-                index_path, query, limit=limit,
-                enable_fuzzy=False, enable_vector=False, pure_vector=False
-            )
         else:
             results_list = []

@@ -259,7 +241,7 @@ def analyze_method_contributions(
     # Compute RRF with each method's contribution
     rrf_map = {}
     for name, results in method_results.items():
-        if results and name in ["fts_exact", "splade", "vector"]:
+        if results and name in ["fts_exact", "vector"]:
             # Rename for RRF
             rrf_name = name.replace("fts_exact", "exact")
             rrf_map[rrf_name] = results
@@ -310,7 +292,7 @@ def experiment_fts_rerank_fusion(
     """Experiment: FTS + Rerank fusion vs standard hybrid.

     Compares:
-    1. Standard Hybrid (SPLADE + Vector RRF)
+    1. Standard Hybrid (FTS + Vector RRF)
     2. FTS + CrossEncoder Rerank -> then fuse with Vector
     """
     results = {
@@ -336,11 +318,10 @@ def experiment_fts_rerank_fusion(
         "strategies": {}
     }

-    # Strategy 1: Standard Hybrid (SPLADE + Vector)
+    # Strategy 1: Standard Hybrid (FTS + Vector)
     try:
         engine = HybridSearchEngine(weights=DEFAULT_WEIGHTS)
         engine._config = type('obj', (object,), {
-            'enable_splade': True,
             'use_fts_fallback': False,
             'embedding_use_gpu': True,
         })()
@@ -364,10 +345,9 @@ def experiment_fts_rerank_fusion(
     # Strategy 2: FTS + Rerank -> Fuse with Vector
     try:
         # Step 1: Get FTS results (coarse)
-        fts_engine = HybridSearchEngine(weights=FTS_FALLBACK_WEIGHTS)
+        fts_engine = HybridSearchEngine(weights=DEFAULT_WEIGHTS)
         fts_engine._config = type('obj', (object,), {
             'use_fts_fallback': True,
-            'enable_splade': False,
             'embedding_use_gpu': True,
         })()