mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-02-10 02:24:35 +08:00
refactor: 移除 SPLADE 和 hybrid_cascade,精简搜索架构
删除 SPLADE 稀疏神经搜索后端和 hybrid_cascade 策略,
将搜索架构从 6 种后端简化为 4 种(FTS Exact/Fuzzy, Binary Vector, Dense Vector, LSP)。
主要变更:
- 删除 splade_encoder.py, splade_index.py, migration_009 等 4 个文件
- 移除 config.py 中 SPLADE 相关配置(enable_splade, splade_model 等)
- DEFAULT_WEIGHTS 改为 FTS 权重 {exact:0.25, fuzzy:0.1, vector:0.5, lsp:0.15}
- 删除 hybrid_cascade_search(),所有 cascade fallback 改为 self.search()
- API fusion_strategy='hybrid' 向后兼容映射到 binary_rerank
- 删除 CLI index_splade/splade_status 命令和 --method splade
- 更新测试、基准测试和文档
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
"""Analysis script for hybrid search method contribution and storage architecture.
|
||||
|
||||
This script analyzes:
|
||||
1. Individual method contribution in hybrid search (FTS/SPLADE/Vector)
|
||||
1. Individual method contribution in hybrid search (FTS/Vector)
|
||||
2. Storage architecture conflicts between different retrieval methods
|
||||
3. FTS + Rerank fusion experiment
|
||||
"""
|
||||
@@ -24,9 +24,7 @@ from codexlens.search.ranking import (
|
||||
reciprocal_rank_fusion,
|
||||
cross_encoder_rerank,
|
||||
DEFAULT_WEIGHTS,
|
||||
FTS_FALLBACK_WEIGHTS,
|
||||
)
|
||||
from codexlens.search.hybrid_search import THREE_WAY_WEIGHTS
|
||||
from codexlens.entities import SearchResult
|
||||
|
||||
|
||||
@@ -117,15 +115,7 @@ def analyze_storage_architecture(index_path: Path) -> Dict[str, Any]:
|
||||
"binary cascade search."
|
||||
)
|
||||
|
||||
# 2. Check SPLADE index status
|
||||
if "splade_posting_list" in tables:
|
||||
splade_count = results["tables"]["splade_posting_list"]["row_count"]
|
||||
if splade_count == 0:
|
||||
results["recommendations"].append(
|
||||
"SPLADE tables exist but empty. Run SPLADE indexing to enable sparse retrieval."
|
||||
)
|
||||
|
||||
# 3. Check FTS tables
|
||||
# 2. Check FTS tables
|
||||
fts_tables = [t for t in tables if t.startswith("files_fts")]
|
||||
if len(fts_tables) >= 2:
|
||||
results["recommendations"].append(
|
||||
@@ -163,10 +153,9 @@ def analyze_method_contributions(
|
||||
|
||||
# Run each method independently
|
||||
methods = {
|
||||
"fts_exact": {"fuzzy": False, "vector": False, "splade": False},
|
||||
"fts_fuzzy": {"fuzzy": True, "vector": False, "splade": False},
|
||||
"vector": {"fuzzy": False, "vector": True, "splade": False},
|
||||
"splade": {"fuzzy": False, "vector": False, "splade": True},
|
||||
"fts_exact": {"fuzzy": False, "vector": False},
|
||||
"fts_fuzzy": {"fuzzy": True, "vector": False},
|
||||
"vector": {"fuzzy": False, "vector": True},
|
||||
}
|
||||
|
||||
method_results: Dict[str, List[SearchResult]] = {}
|
||||
@@ -178,7 +167,6 @@ def analyze_method_contributions(
|
||||
# Set config to disable/enable specific backends
|
||||
engine._config = type('obj', (object,), {
|
||||
'use_fts_fallback': method_name.startswith("fts"),
|
||||
'enable_splade': method_name == "splade",
|
||||
'embedding_use_gpu': True,
|
||||
})()
|
||||
|
||||
@@ -186,13 +174,13 @@ def analyze_method_contributions(
|
||||
|
||||
if method_name == "fts_exact":
|
||||
# Force FTS fallback mode with fuzzy disabled
|
||||
engine.weights = FTS_FALLBACK_WEIGHTS.copy()
|
||||
engine.weights = DEFAULT_WEIGHTS.copy()
|
||||
results_list = engine.search(
|
||||
index_path, query, limit=limit,
|
||||
enable_fuzzy=False, enable_vector=False, pure_vector=False
|
||||
)
|
||||
elif method_name == "fts_fuzzy":
|
||||
engine.weights = FTS_FALLBACK_WEIGHTS.copy()
|
||||
engine.weights = DEFAULT_WEIGHTS.copy()
|
||||
results_list = engine.search(
|
||||
index_path, query, limit=limit,
|
||||
enable_fuzzy=True, enable_vector=False, pure_vector=False
|
||||
@@ -202,12 +190,6 @@ def analyze_method_contributions(
|
||||
index_path, query, limit=limit,
|
||||
enable_fuzzy=False, enable_vector=True, pure_vector=True
|
||||
)
|
||||
elif method_name == "splade":
|
||||
engine.weights = {"splade": 1.0}
|
||||
results_list = engine.search(
|
||||
index_path, query, limit=limit,
|
||||
enable_fuzzy=False, enable_vector=False, pure_vector=False
|
||||
)
|
||||
else:
|
||||
results_list = []
|
||||
|
||||
@@ -259,7 +241,7 @@ def analyze_method_contributions(
|
||||
# Compute RRF with each method's contribution
|
||||
rrf_map = {}
|
||||
for name, results in method_results.items():
|
||||
if results and name in ["fts_exact", "splade", "vector"]:
|
||||
if results and name in ["fts_exact", "vector"]:
|
||||
# Rename for RRF
|
||||
rrf_name = name.replace("fts_exact", "exact")
|
||||
rrf_map[rrf_name] = results
|
||||
@@ -310,7 +292,7 @@ def experiment_fts_rerank_fusion(
|
||||
"""Experiment: FTS + Rerank fusion vs standard hybrid.
|
||||
|
||||
Compares:
|
||||
1. Standard Hybrid (SPLADE + Vector RRF)
|
||||
1. Standard Hybrid (FTS + Vector RRF)
|
||||
2. FTS + CrossEncoder Rerank -> then fuse with Vector
|
||||
"""
|
||||
results = {
|
||||
@@ -336,11 +318,10 @@ def experiment_fts_rerank_fusion(
|
||||
"strategies": {}
|
||||
}
|
||||
|
||||
# Strategy 1: Standard Hybrid (SPLADE + Vector)
|
||||
# Strategy 1: Standard Hybrid (FTS + Vector)
|
||||
try:
|
||||
engine = HybridSearchEngine(weights=DEFAULT_WEIGHTS)
|
||||
engine._config = type('obj', (object,), {
|
||||
'enable_splade': True,
|
||||
'use_fts_fallback': False,
|
||||
'embedding_use_gpu': True,
|
||||
})()
|
||||
@@ -364,10 +345,9 @@ def experiment_fts_rerank_fusion(
|
||||
# Strategy 2: FTS + Rerank -> Fuse with Vector
|
||||
try:
|
||||
# Step 1: Get FTS results (coarse)
|
||||
fts_engine = HybridSearchEngine(weights=FTS_FALLBACK_WEIGHTS)
|
||||
fts_engine = HybridSearchEngine(weights=DEFAULT_WEIGHTS)
|
||||
fts_engine._config = type('obj', (object,), {
|
||||
'use_fts_fallback': True,
|
||||
'enable_splade': False,
|
||||
'embedding_use_gpu': True,
|
||||
})()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user