mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-03-06 16:31:12 +08:00
refactor: 移除 SPLADE 和 hybrid_cascade,精简搜索架构
删除 SPLADE 稀疏神经搜索后端和 hybrid_cascade 策略,
将搜索架构从 6 种后端简化为 4 种(FTS Exact/Fuzzy, Binary Vector, Dense Vector, LSP)。
主要变更:
- 删除 splade_encoder.py, splade_index.py, migration_009 等 4 个文件
- 移除 config.py 中 SPLADE 相关配置(enable_splade, splade_model 等)
- DEFAULT_WEIGHTS 改为 FTS 权重 {exact:0.25, fuzzy:0.1, vector:0.5, lsp:0.15}
- 删除 hybrid_cascade_search(),所有 cascade fallback 改为 self.search()
- API fusion_strategy='hybrid' 向后兼容映射到 binary_rerank
- 删除 CLI index_splade/splade_status 命令和 --method splade
- 更新测试、基准测试和文档
This commit is contained in:
@@ -151,15 +151,6 @@ def _cleanup_fastembed_resources() -> None:
|
||||
pass
|
||||
|
||||
|
||||
def _cleanup_splade_resources() -> None:
|
||||
"""Release SPLADE encoder ONNX resources."""
|
||||
try:
|
||||
from codexlens.semantic.splade_encoder import clear_splade_cache
|
||||
clear_splade_cache()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
def _generate_chunks_from_cursor(
|
||||
cursor,
|
||||
chunker,
|
||||
@@ -398,7 +389,6 @@ def generate_embeddings(
|
||||
endpoints: Optional[List] = None,
|
||||
strategy: Optional[str] = None,
|
||||
cooldown: Optional[float] = None,
|
||||
splade_db_path: Optional[Path] = None,
|
||||
) -> Dict[str, any]:
|
||||
"""Generate embeddings for an index using memory-efficient batch processing.
|
||||
|
||||
@@ -428,9 +418,6 @@ def generate_embeddings(
|
||||
Each dict has keys: model, api_key, api_base, weight.
|
||||
strategy: Selection strategy for multi-endpoint mode (round_robin, latency_aware).
|
||||
cooldown: Default cooldown seconds for rate-limited endpoints.
|
||||
splade_db_path: Optional path to centralized SPLADE database. If None, SPLADE
|
||||
is written to index_path (legacy behavior). Use index_root / SPLADE_DB_NAME
|
||||
for centralized storage.
|
||||
|
||||
Returns:
|
||||
Result dictionary with generation statistics
|
||||
@@ -822,97 +809,10 @@ def generate_embeddings(
|
||||
if progress_callback:
|
||||
progress_callback(f"Finalizing index... Building ANN index for {total_chunks_created} chunks")
|
||||
|
||||
# --- SPLADE SPARSE ENCODING (after dense embeddings) ---
|
||||
# Add SPLADE encoding if enabled in config
|
||||
splade_success = False
|
||||
splade_error = None
|
||||
|
||||
try:
|
||||
from codexlens.config import Config, SPLADE_DB_NAME
|
||||
config = Config.load()
|
||||
|
||||
if config.enable_splade:
|
||||
from codexlens.semantic.splade_encoder import check_splade_available, get_splade_encoder
|
||||
from codexlens.storage.splade_index import SpladeIndex
|
||||
|
||||
ok, err = check_splade_available()
|
||||
if ok:
|
||||
if progress_callback:
|
||||
progress_callback(f"Generating SPLADE sparse vectors for {total_chunks_created} chunks...")
|
||||
|
||||
# Initialize SPLADE encoder and index
|
||||
splade_encoder = get_splade_encoder(use_gpu=use_gpu)
|
||||
# Use centralized SPLADE database if provided, otherwise fallback to index_path
|
||||
effective_splade_path = splade_db_path if splade_db_path else index_path
|
||||
splade_index = SpladeIndex(effective_splade_path)
|
||||
splade_index.create_tables()
|
||||
|
||||
# Retrieve all chunks from database for SPLADE encoding
|
||||
with sqlite3.connect(index_path) as conn:
|
||||
conn.row_factory = sqlite3.Row
|
||||
cursor = conn.execute("SELECT id, content FROM semantic_chunks ORDER BY id")
|
||||
|
||||
# Batch encode for efficiency
|
||||
SPLADE_BATCH_SIZE = 32
|
||||
batch_postings = []
|
||||
chunk_batch = []
|
||||
chunk_ids = []
|
||||
|
||||
for row in cursor:
|
||||
chunk_id = row["id"]
|
||||
content = row["content"]
|
||||
|
||||
chunk_ids.append(chunk_id)
|
||||
chunk_batch.append(content)
|
||||
|
||||
# Process batch when full
|
||||
if len(chunk_batch) >= SPLADE_BATCH_SIZE:
|
||||
sparse_vecs = splade_encoder.encode_batch(chunk_batch, batch_size=SPLADE_BATCH_SIZE)
|
||||
for cid, sparse_vec in zip(chunk_ids, sparse_vecs):
|
||||
batch_postings.append((cid, sparse_vec))
|
||||
|
||||
chunk_batch = []
|
||||
chunk_ids = []
|
||||
|
||||
# Process remaining chunks
|
||||
if chunk_batch:
|
||||
sparse_vecs = splade_encoder.encode_batch(chunk_batch, batch_size=SPLADE_BATCH_SIZE)
|
||||
for cid, sparse_vec in zip(chunk_ids, sparse_vecs):
|
||||
batch_postings.append((cid, sparse_vec))
|
||||
|
||||
# Batch insert all postings
|
||||
if batch_postings:
|
||||
splade_index.add_postings_batch(batch_postings)
|
||||
|
||||
# Set metadata
|
||||
splade_index.set_metadata(
|
||||
model_name=splade_encoder.model_name,
|
||||
vocab_size=splade_encoder.vocab_size
|
||||
)
|
||||
|
||||
splade_success = True
|
||||
if progress_callback:
|
||||
stats = splade_index.get_stats()
|
||||
progress_callback(
|
||||
f"SPLADE index created: {stats['total_postings']} postings, "
|
||||
f"{stats['unique_tokens']} unique tokens"
|
||||
)
|
||||
else:
|
||||
logger.debug("SPLADE not available: %s", err)
|
||||
splade_error = f"SPLADE not available: {err}"
|
||||
except Exception as e:
|
||||
splade_error = str(e)
|
||||
logger.warning("SPLADE encoding failed: %s", e)
|
||||
|
||||
# Report SPLADE status after processing
|
||||
if progress_callback and not splade_success and splade_error:
|
||||
progress_callback(f"SPLADE index: FAILED - {splade_error}")
|
||||
|
||||
except Exception as e:
|
||||
# Cleanup on error to prevent process hanging
|
||||
try:
|
||||
_cleanup_fastembed_resources()
|
||||
_cleanup_splade_resources()
|
||||
gc.collect()
|
||||
except Exception:
|
||||
pass
|
||||
@@ -924,7 +824,6 @@ def generate_embeddings(
|
||||
# This is critical - without it, ONNX Runtime threads prevent Python from exiting
|
||||
try:
|
||||
_cleanup_fastembed_resources()
|
||||
_cleanup_splade_resources()
|
||||
gc.collect()
|
||||
except Exception:
|
||||
pass
|
||||
@@ -1098,10 +997,6 @@ def generate_embeddings_recursive(
|
||||
if progress_callback:
|
||||
progress_callback(f"Found {len(index_files)} index databases to process")
|
||||
|
||||
# Calculate centralized SPLADE database path
|
||||
from codexlens.config import SPLADE_DB_NAME
|
||||
splade_db_path = index_root / SPLADE_DB_NAME
|
||||
|
||||
# Process each index database
|
||||
all_results = []
|
||||
total_chunks = 0
|
||||
@@ -1131,7 +1026,6 @@ def generate_embeddings_recursive(
|
||||
endpoints=endpoints,
|
||||
strategy=strategy,
|
||||
cooldown=cooldown,
|
||||
splade_db_path=splade_db_path, # Use centralized SPLADE storage
|
||||
)
|
||||
|
||||
all_results.append({
|
||||
@@ -1153,7 +1047,6 @@ def generate_embeddings_recursive(
|
||||
# Each generate_embeddings() call does its own cleanup, but do a final one to be safe
|
||||
try:
|
||||
_cleanup_fastembed_resources()
|
||||
_cleanup_splade_resources()
|
||||
gc.collect()
|
||||
except Exception:
|
||||
pass
|
||||
@@ -1197,7 +1090,6 @@ def generate_dense_embeddings_centralized(
|
||||
Target architecture:
|
||||
<index_root>/
|
||||
|-- _vectors.hnsw # Centralized dense vector ANN index
|
||||
|-- _splade.db # Centralized sparse vector index
|
||||
|-- src/
|
||||
|-- _index.db # No longer contains .hnsw file
|
||||
|
||||
@@ -1219,7 +1111,7 @@ def generate_dense_embeddings_centralized(
|
||||
Returns:
|
||||
Result dictionary with generation statistics
|
||||
"""
|
||||
from codexlens.config import VECTORS_HNSW_NAME, SPLADE_DB_NAME
|
||||
from codexlens.config import VECTORS_HNSW_NAME
|
||||
|
||||
# Get defaults from config if not specified
|
||||
(default_backend, default_model, default_gpu,
|
||||
@@ -1543,90 +1435,6 @@ def generate_dense_embeddings_centralized(
|
||||
logger.warning("Binary vector generation failed: %s", e)
|
||||
# Non-fatal: continue without binary vectors
|
||||
|
||||
# --- SPLADE Sparse Index Generation (Centralized) ---
|
||||
splade_success = False
|
||||
splade_chunks_count = 0
|
||||
try:
|
||||
from codexlens.config import Config
|
||||
config = Config.load()
|
||||
|
||||
if config.enable_splade and chunk_id_to_info:
|
||||
from codexlens.semantic.splade_encoder import check_splade_available, get_splade_encoder
|
||||
from codexlens.storage.splade_index import SpladeIndex
|
||||
import json
|
||||
|
||||
ok, err = check_splade_available()
|
||||
if ok:
|
||||
if progress_callback:
|
||||
progress_callback(f"Generating SPLADE sparse vectors for {len(chunk_id_to_info)} chunks...")
|
||||
|
||||
# Initialize SPLADE encoder and index
|
||||
splade_encoder = get_splade_encoder(use_gpu=use_gpu)
|
||||
splade_db_path = index_root / SPLADE_DB_NAME
|
||||
splade_index = SpladeIndex(splade_db_path)
|
||||
splade_index.create_tables()
|
||||
|
||||
# Batch encode for efficiency
|
||||
SPLADE_BATCH_SIZE = 32
|
||||
all_postings = []
|
||||
all_chunk_metadata = []
|
||||
|
||||
# Create batches from chunk_id_to_info
|
||||
chunk_items = list(chunk_id_to_info.items())
|
||||
|
||||
for i in range(0, len(chunk_items), SPLADE_BATCH_SIZE):
|
||||
batch_items = chunk_items[i:i + SPLADE_BATCH_SIZE]
|
||||
chunk_ids = [item[0] for item in batch_items]
|
||||
chunk_contents = [item[1]["content"] for item in batch_items]
|
||||
|
||||
# Generate sparse vectors
|
||||
sparse_vecs = splade_encoder.encode_batch(chunk_contents, batch_size=SPLADE_BATCH_SIZE)
|
||||
for cid, sparse_vec in zip(chunk_ids, sparse_vecs):
|
||||
all_postings.append((cid, sparse_vec))
|
||||
|
||||
if progress_callback and (i + SPLADE_BATCH_SIZE) % 100 == 0:
|
||||
progress_callback(f"SPLADE encoding: {min(i + SPLADE_BATCH_SIZE, len(chunk_items))}/{len(chunk_items)}")
|
||||
|
||||
# Batch insert all postings
|
||||
if all_postings:
|
||||
splade_index.add_postings_batch(all_postings)
|
||||
|
||||
# CRITICAL FIX: Populate splade_chunks table
|
||||
for cid, info in chunk_id_to_info.items():
|
||||
metadata_str = json.dumps(info.get("metadata", {})) if info.get("metadata") else None
|
||||
all_chunk_metadata.append((
|
||||
cid,
|
||||
info["file_path"],
|
||||
info["content"],
|
||||
metadata_str,
|
||||
info.get("source_index_db")
|
||||
))
|
||||
|
||||
if all_chunk_metadata:
|
||||
splade_index.add_chunks_metadata_batch(all_chunk_metadata)
|
||||
splade_chunks_count = len(all_chunk_metadata)
|
||||
|
||||
# Set metadata
|
||||
splade_index.set_metadata(
|
||||
model_name=splade_encoder.model_name,
|
||||
vocab_size=splade_encoder.vocab_size
|
||||
)
|
||||
|
||||
splade_index.close()
|
||||
splade_success = True
|
||||
|
||||
if progress_callback:
|
||||
progress_callback(f"SPLADE index created: {len(all_postings)} postings, {splade_chunks_count} chunks")
|
||||
|
||||
else:
|
||||
if progress_callback:
|
||||
progress_callback(f"SPLADE not available, skipping sparse index: {err}")
|
||||
|
||||
except Exception as e:
|
||||
logger.warning("SPLADE encoding failed: %s", e)
|
||||
if progress_callback:
|
||||
progress_callback(f"SPLADE encoding failed: {e}")
|
||||
|
||||
elapsed_time = time.time() - start_time
|
||||
|
||||
# Cleanup
|
||||
@@ -1647,8 +1455,6 @@ def generate_dense_embeddings_centralized(
|
||||
"model_name": embedder.model_name,
|
||||
"central_index_path": str(central_hnsw_path),
|
||||
"failed_files": failed_files[:5],
|
||||
"splade_success": splade_success,
|
||||
"splade_chunks": splade_chunks_count,
|
||||
"binary_success": binary_success,
|
||||
"binary_count": binary_count,
|
||||
},
|
||||
|
||||
Reference in New Issue
Block a user