refactor: 移除 SPLADE 和 hybrid_cascade,精简搜索架构

删除 SPLADE 稀疏神经搜索后端和 hybrid_cascade 策略,
将搜索架构从 6 种后端简化为 4 种(FTS Exact/Fuzzy, Binary Vector, Dense Vector, LSP)。

主要变更:
- 删除 splade_encoder.py, splade_index.py, migration_009 等 4 个文件
- 移除 config.py 中 SPLADE 相关配置(enable_splade, splade_model 等)
- DEFAULT_WEIGHTS 改为 FTS 权重 {exact:0.25, fuzzy:0.1, vector:0.5, lsp:0.15}
- 删除 hybrid_cascade_search(),所有 cascade fallback 改为 self.search()
- API fusion_strategy='hybrid' 向后兼容映射到 binary_rerank
- 删除 CLI index_splade/splade_status 命令和 --method splade
- 更新测试、基准测试和文档
This commit is contained in:
catlog22
2026-02-08 12:07:41 +08:00
parent 72d2ae750b
commit 71faaf43a8
22 changed files with 126 additions and 2883 deletions

View File

@@ -19,9 +19,6 @@ WORKSPACE_DIR_NAME = ".codexlens"
# Settings file name
SETTINGS_FILE_NAME = "settings.json"
# SPLADE index database name (centralized storage)
SPLADE_DB_NAME = "_splade.db"
# Dense vector storage names (centralized storage)
VECTORS_HNSW_NAME = "_vectors.hnsw"
VECTORS_META_DB_NAME = "_vectors_meta.db"
@@ -113,15 +110,6 @@ class Config:
# For litellm: model name from config (e.g., "qwen3-embedding")
embedding_use_gpu: bool = True # For fastembed: whether to use GPU acceleration
# SPLADE sparse retrieval configuration
enable_splade: bool = False # Disable SPLADE by default (slow ~360ms, use FTS instead)
splade_model: str = "naver/splade-cocondenser-ensembledistil"
splade_threshold: float = 0.01 # Min weight to store in index
splade_onnx_path: Optional[str] = None # Custom ONNX model path
# FTS fallback (enabled by default, available via --use-fts)
use_fts_fallback: bool = True # Use FTS for sparse search (fast, SPLADE disabled)
# Indexing/search optimizations
global_symbol_index_enabled: bool = True # Enable project-wide symbol index fast path
enable_merkle_detection: bool = True # Enable content-hash based incremental indexing
@@ -152,7 +140,7 @@ class Config:
enable_cascade_search: bool = False # Enable cascade search (coarse + fine ranking)
cascade_coarse_k: int = 100 # Number of coarse candidates from first stage
cascade_fine_k: int = 10 # Number of final results after reranking
cascade_strategy: str = "binary" # "binary" (fast binary+dense) or "hybrid" (FTS+SPLADE+Vector+CrossEncoder)
cascade_strategy: str = "binary" # "binary", "binary_rerank", "dense_rerank", or "staged"
# Staged cascade search configuration (4-stage pipeline)
staged_coarse_k: int = 200 # Number of coarse candidates from Stage 1 binary search
@@ -398,11 +386,11 @@ class Config:
cascade = settings.get("cascade", {})
if "strategy" in cascade:
strategy = cascade["strategy"]
if strategy in {"binary", "hybrid", "binary_rerank", "dense_rerank"}:
if strategy in {"binary", "binary_rerank", "dense_rerank", "staged"}:
self.cascade_strategy = strategy
else:
log.warning(
"Invalid cascade strategy in %s: %r (expected 'binary', 'hybrid', 'binary_rerank', or 'dense_rerank')",
"Invalid cascade strategy in %s: %r (expected 'binary', 'binary_rerank', 'dense_rerank', or 'staged')",
self.settings_path,
strategy,
)