Implement SPLADE sparse encoder and associated database migrations

- Added `splade_encoder.py` for ONNX-optimized SPLADE encoding, including methods for encoding text and batch processing. - Created `SPLADE_IMPLEMENTATION.md` to document the SPLADE encoder's functionality, design patterns, and integration points. - Introduced migration script `migration_009_add_splade.py` to add SPLADE metadata and posting list tables to the database. - Developed `splade_index.py` for managing the SPLADE inverted index, supporting efficient sparse vector retrieval. - Added verification script `verify_watcher.py` to test FileWatcher event filtering and debouncing functionality.
2026-02-10 02:24:35 +08:00 · 2026-01-01 17:41:22 +08:00
parent 520f2d26f2
commit 5bb01755bc
16 changed files with 3122 additions and 2792 deletions
--- a/codex-lens/src/codexlens/cli/commands.py
+++ b/codex-lens/src/codexlens/cli/commands.py
@@ -415,11 +415,20 @@ def search(
    depth: int = typer.Option(-1, "--depth", "-d", help="Search depth (-1 = unlimited, 0 = current only)."),
    files_only: bool = typer.Option(False, "--files-only", "-f", help="Return only file paths without content snippets."),
    mode: str = typer.Option("auto", "--mode", "-m", help="Search mode: auto, exact, fuzzy, hybrid, vector, pure-vector."),
-    weights: Optional[str] = typer.Option(None, "--weights", help="Custom RRF weights as 'exact,fuzzy,vector' (e.g., '0.5,0.3,0.2')."),
+    weights: Optional[str] = typer.Option(
+        None,
+        "--weights", "-w",
+        help="RRF weights as key=value pairs (e.g., 'splade=0.4,vector=0.6' or 'exact=0.3,fuzzy=0.1,vector=0.6'). Default: auto-detect based on available backends."
+    ),
+    use_fts: bool = typer.Option(
+        False,
+        "--use-fts",
+        help="Use FTS (exact+fuzzy) instead of SPLADE for sparse retrieval"
+    ),
    json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."),
 ) -> None:
-    """Search indexed file contents using SQLite FTS5 or semantic vectors.
+    """Search indexed file contents using hybrid retrieval.

    Uses chain search across directory indexes.
    Use --depth to limit search recursion (0 = current dir only).
@@ -428,17 +437,27 @@ def search(
      - auto: Auto-detect (hybrid if embeddings exist, exact otherwise) [default]
      - exact: Exact FTS using unicode61 tokenizer - for code identifiers
      - fuzzy: Fuzzy FTS using trigram tokenizer - for typo-tolerant search
-      - hybrid: RRF fusion of exact + fuzzy + vector (recommended) - best recall
-      - vector: Vector search with exact FTS fallback - semantic + keyword
+      - hybrid: RRF fusion of sparse + dense search (recommended) - best recall
+      - vector: Vector search with sparse fallback - semantic + keyword
      - pure-vector: Pure semantic vector search only - natural language queries

+    SPLADE Mode:
+      When SPLADE is available (pip install codex-lens[splade]), it automatically
+      replaces FTS (exact+fuzzy) as the sparse retrieval backend. SPLADE provides
+      semantic term expansion for better synonym handling.
+      
+      Use --use-fts to force FTS mode instead of SPLADE.
+
    Vector Search Requirements:
      Vector search modes require pre-generated embeddings.
      Use 'codexlens embeddings-generate' to create embeddings first.

-    Hybrid Mode:
-      Default weights: exact=0.3, fuzzy=0.1, vector=0.6
-      Use --weights to customize (e.g., --weights 0.5,0.3,0.2)
+    Hybrid Mode Weights:
+      Use --weights to adjust RRF fusion weights:
+      - SPLADE mode: 'splade=0.4,vector=0.6' (default)
+      - FTS mode: 'exact=0.3,fuzzy=0.1,vector=0.6' (default)
+      
+      Legacy format also supported: '0.3,0.1,0.6' (exact,fuzzy,vector)

    Examples:
      # Auto-detect mode (uses hybrid if embeddings available)
@@ -450,11 +469,19 @@ def search(
      # Semantic search (requires embeddings)
      codexlens search "how to verify user credentials" --mode pure-vector

-      # Force hybrid mode
-      codexlens search "authentication" --mode hybrid
+      # Force hybrid mode with custom weights
+      codexlens search "authentication" --mode hybrid --weights splade=0.5,vector=0.5
+      
+      # Force FTS instead of SPLADE
+      codexlens search "authentication" --use-fts
    """
    _configure_logging(verbose, json_mode)
    search_path = path.expanduser().resolve()
+    
+    # Configure search with FTS fallback if requested
+    config = Config()
+    if use_fts:
+        config.use_fts_fallback = True

    # Validate mode
    valid_modes = ["auto", "exact", "fuzzy", "hybrid", "vector", "pure-vector"]
@@ -470,22 +497,56 @@ def search(
    hybrid_weights = None
    if weights:
        try:
-            weight_parts = [float(w.strip()) for w in weights.split(",")]
-            if len(weight_parts) == 3:
-                weight_sum = sum(weight_parts)
+            # Check if using key=value format (new) or legacy comma-separated format
+            if "=" in weights:
+                # New format: splade=0.4,vector=0.6 or exact=0.3,fuzzy=0.1,vector=0.6
+                weight_dict = {}
+                for pair in weights.split(","):
+                    if "=" in pair:
+                        key, val = pair.split("=", 1)
+                        weight_dict[key.strip()] = float(val.strip())
+                    else:
+                        raise ValueError("Mixed format not supported - use all key=value pairs")
+                
+                # Validate and normalize weights
+                weight_sum = sum(weight_dict.values())
                if abs(weight_sum - 1.0) > 0.01:
-                    console.print(f"[yellow]Warning: Weights sum to {weight_sum:.2f}, should sum to 1.0. Normalizing...[/yellow]")
-                    # Normalize weights
-                    weight_parts = [w / weight_sum for w in weight_parts]
-                hybrid_weights = {
-                    "exact": weight_parts[0],
-                    "fuzzy": weight_parts[1],
-                    "vector": weight_parts[2],
-                }
+                    if not json_mode:
+                        console.print(f"[yellow]Warning: Weights sum to {weight_sum:.2f}, should sum to 1.0. Normalizing...[/yellow]")
+                    weight_dict = {k: v / weight_sum for k, v in weight_dict.items()}
+                
+                hybrid_weights = weight_dict
            else:
-                console.print("[yellow]Warning: Invalid weights format (need 3 values). Using defaults.[/yellow]")
-        except ValueError:
-            console.print("[yellow]Warning: Invalid weights format. Using defaults.[/yellow]")
+                # Legacy format: 0.3,0.1,0.6 (exact,fuzzy,vector)
+                weight_parts = [float(w.strip()) for w in weights.split(",")]
+                if len(weight_parts) == 3:
+                    weight_sum = sum(weight_parts)
+                    if abs(weight_sum - 1.0) > 0.01:
+                        if not json_mode:
+                            console.print(f"[yellow]Warning: Weights sum to {weight_sum:.2f}, should sum to 1.0. Normalizing...[/yellow]")
+                        weight_parts = [w / weight_sum for w in weight_parts]
+                    hybrid_weights = {
+                        "exact": weight_parts[0],
+                        "fuzzy": weight_parts[1],
+                        "vector": weight_parts[2],
+                    }
+                elif len(weight_parts) == 2:
+                    # Two values: assume splade,vector
+                    weight_sum = sum(weight_parts)
+                    if abs(weight_sum - 1.0) > 0.01:
+                        if not json_mode:
+                            console.print(f"[yellow]Warning: Weights sum to {weight_sum:.2f}, should sum to 1.0. Normalizing...[/yellow]")
+                        weight_parts = [w / weight_sum for w in weight_parts]
+                    hybrid_weights = {
+                        "splade": weight_parts[0],
+                        "vector": weight_parts[1],
+                    }
+                else:
+                    if not json_mode:
+                        console.print("[yellow]Warning: Invalid weights format. Using defaults.[/yellow]")
+        except ValueError as e:
+            if not json_mode:
+                console.print(f"[yellow]Warning: Invalid weights format ({e}). Using defaults.[/yellow]")

    registry: RegistryStore | None = None
    try:
@@ -2381,6 +2442,188 @@ def gpu_reset(
            console.print(f"  Device: [cyan]{gpu_info.gpu_name}[/cyan]")


+
+# ==================== SPLADE Commands ====================
+
+@app.command("splade-index")
+def splade_index_command(
+    path: Path = typer.Argument(..., help="Project path to index"),
+    rebuild: bool = typer.Option(False, "--rebuild", "-r", help="Force rebuild SPLADE index"),
+    verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose output."),
+) -> None:
+    """Generate SPLADE sparse index for existing codebase.
+
+    Encodes all semantic chunks with SPLADE model and builds inverted index
+    for efficient sparse retrieval.
+
+    Examples:
+        codexlens splade-index ~/projects/my-app
+        codexlens splade-index . --rebuild
+    """
+    _configure_logging(verbose)
+
+    from codexlens.semantic.splade_encoder import get_splade_encoder, check_splade_available
+    from codexlens.storage.splade_index import SpladeIndex
+    from codexlens.semantic.vector_store import VectorStore
+
+    # Check SPLADE availability
+    ok, err = check_splade_available()
+    if not ok:
+        console.print(f"[red]SPLADE not available: {err}[/red]")
+        console.print("[dim]Install with: pip install transformers torch[/dim]")
+        raise typer.Exit(1)
+
+    # Find index database
+    target_path = path.expanduser().resolve()
+
+    # Try to find _index.db
+    if target_path.is_file() and target_path.name == "_index.db":
+        index_db = target_path
+    elif target_path.is_dir():
+        # Check for local .codexlens/_index.db
+        local_index = target_path / ".codexlens" / "_index.db"
+        if local_index.exists():
+            index_db = local_index
+        else:
+            # Try to find via registry
+            registry = RegistryStore()
+            try:
+                registry.initialize()
+                mapper = PathMapper()
+                index_db = mapper.source_to_index_db(target_path)
+                if not index_db.exists():
+                    console.print(f"[red]Error:[/red] No index found for {target_path}")
+                    console.print("Run 'codexlens init' first to create an index")
+                    raise typer.Exit(1)
+            finally:
+                registry.close()
+    else:
+        console.print(f"[red]Error:[/red] Path must be _index.db file or indexed directory")
+        raise typer.Exit(1)
+
+    splade_db = index_db.parent / "_splade.db"
+
+    if splade_db.exists() and not rebuild:
+        console.print("[yellow]SPLADE index exists. Use --rebuild to regenerate.[/yellow]")
+        return
+
+    # Load chunks from vector store
+    console.print(f"[blue]Loading chunks from {index_db.name}...[/blue]")
+    vector_store = VectorStore(index_db)
+    chunks = vector_store.get_all_chunks()
+
+    if not chunks:
+        console.print("[yellow]No chunks found in vector store[/yellow]")
+        console.print("[dim]Generate embeddings first with 'codexlens embeddings-generate'[/dim]")
+        raise typer.Exit(1)
+
+    console.print(f"[blue]Encoding {len(chunks)} chunks with SPLADE...[/blue]")
+
+    # Initialize SPLADE
+    encoder = get_splade_encoder()
+    splade_index = SpladeIndex(splade_db)
+    splade_index.create_tables()
+
+    # Encode in batches with progress bar
+    with Progress(
+        SpinnerColumn(),
+        TextColumn("[progress.description]{task.description}"),
+        BarColumn(),
+        TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
+        TimeElapsedColumn(),
+        console=console,
+    ) as progress:
+        task = progress.add_task("Encoding...", total=len(chunks))
+        for chunk in chunks:
+            sparse_vec = encoder.encode_text(chunk.content)
+            splade_index.add_posting(chunk.id, sparse_vec)
+            progress.advance(task)
+
+    # Set metadata
+    splade_index.set_metadata(
+        model_name=encoder.model_name,
+        vocab_size=encoder.vocab_size
+    )
+
+    stats = splade_index.get_stats()
+    console.print(f"[green]✓[/green] SPLADE index built: {stats['unique_chunks']} chunks, {stats['total_postings']} postings")
+    console.print(f"  Database: [dim]{splade_db}[/dim]")
+
+
+@app.command("splade-status")
+def splade_status_command(
+    path: Path = typer.Argument(..., help="Project path"),
+    verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose output."),
+) -> None:
+    """Show SPLADE index status and statistics.
+
+    Examples:
+        codexlens splade-status ~/projects/my-app
+        codexlens splade-status .
+    """
+    _configure_logging(verbose)
+
+    from codexlens.storage.splade_index import SpladeIndex
+    from codexlens.semantic.splade_encoder import check_splade_available
+
+    # Find index database
+    target_path = path.expanduser().resolve()
+
+    if target_path.is_file() and target_path.name == "_index.db":
+        splade_db = target_path.parent / "_splade.db"
+    elif target_path.is_dir():
+        # Check for local .codexlens/_splade.db
+        local_splade = target_path / ".codexlens" / "_splade.db"
+        if local_splade.exists():
+            splade_db = local_splade
+        else:
+            # Try to find via registry
+            registry = RegistryStore()
+            try:
+                registry.initialize()
+                mapper = PathMapper()
+                index_db = mapper.source_to_index_db(target_path)
+                splade_db = index_db.parent / "_splade.db"
+            finally:
+                registry.close()
+    else:
+        console.print(f"[red]Error:[/red] Path must be _index.db file or indexed directory")
+        raise typer.Exit(1)
+
+    if not splade_db.exists():
+        console.print("[yellow]No SPLADE index found[/yellow]")
+        console.print(f"[dim]Run 'codexlens splade-index {path}' to create one[/dim]")
+        return
+
+    splade_index = SpladeIndex(splade_db)
+
+    if not splade_index.has_index():
+        console.print("[yellow]SPLADE tables not initialized[/yellow]")
+        return
+
+    metadata = splade_index.get_metadata()
+    stats = splade_index.get_stats()
+
+    # Create status table
+    table = Table(title="SPLADE Index Status", show_header=False)
+    table.add_column("Property", style="cyan")
+    table.add_column("Value")
+
+    table.add_row("Database", str(splade_db))
+    if metadata:
+        table.add_row("Model", metadata['model_name'])
+        table.add_row("Vocab Size", str(metadata['vocab_size']))
+    table.add_row("Chunks", str(stats['unique_chunks']))
+    table.add_row("Unique Tokens", str(stats['unique_tokens']))
+    table.add_row("Total Postings", str(stats['total_postings']))
+
+    ok, err = check_splade_available()
+    status_text = "[green]Yes[/green]" if ok else f"[red]No[/red] - {err}"
+    table.add_row("SPLADE Available", status_text)
+
+    console.print(table)
+
+
 # ==================== Watch Command ====================

@app.command()
--- a/codex-lens/src/codexlens/cli/embedding_manager.py
+++ b/codex-lens/src/codexlens/cli/embedding_manager.py
@@ -33,6 +33,15 @@ def _cleanup_fastembed_resources() -> None:
        pass


+def _cleanup_splade_resources() -> None:
+    """Release SPLADE encoder ONNX resources."""
+    try:
+        from codexlens.semantic.splade_encoder import clear_splade_cache
+        clear_splade_cache()
+    except Exception:
+        pass
+
+
 def _generate_chunks_from_cursor(
    cursor,
    chunker,
@@ -675,10 +684,96 @@ def generate_embeddings(
                if progress_callback:
                    progress_callback(f"Finalizing index... Building ANN index for {total_chunks_created} chunks")

+            # --- SPLADE SPARSE ENCODING (after dense embeddings) ---
+            # Add SPLADE encoding if enabled in config
+            splade_success = False
+            splade_error = None
+
+            try:
+                from codexlens.config import Config
+                config = Config.load()
+
+                if config.enable_splade:
+                    from codexlens.semantic.splade_encoder import check_splade_available, get_splade_encoder
+                    from codexlens.storage.splade_index import SpladeIndex
+
+                    ok, err = check_splade_available()
+                    if ok:
+                        if progress_callback:
+                            progress_callback(f"Generating SPLADE sparse vectors for {total_chunks_created} chunks...")
+
+                        # Initialize SPLADE encoder and index
+                        splade_encoder = get_splade_encoder(use_gpu=use_gpu)
+                        # Use main index database for SPLADE (not separate _splade.db)
+                        splade_index = SpladeIndex(index_path)
+                        splade_index.create_tables()
+
+                        # Retrieve all chunks from database for SPLADE encoding
+                        with sqlite3.connect(index_path) as conn:
+                            conn.row_factory = sqlite3.Row
+                            cursor = conn.execute("SELECT id, content FROM semantic_chunks ORDER BY id")
+
+                            # Batch encode for efficiency
+                            SPLADE_BATCH_SIZE = 32
+                            batch_postings = []
+                            chunk_batch = []
+                            chunk_ids = []
+
+                            for row in cursor:
+                                chunk_id = row["id"]
+                                content = row["content"]
+
+                                chunk_ids.append(chunk_id)
+                                chunk_batch.append(content)
+
+                                # Process batch when full
+                                if len(chunk_batch) >= SPLADE_BATCH_SIZE:
+                                    sparse_vecs = splade_encoder.encode_batch(chunk_batch, batch_size=SPLADE_BATCH_SIZE)
+                                    for cid, sparse_vec in zip(chunk_ids, sparse_vecs):
+                                        batch_postings.append((cid, sparse_vec))
+
+                                    chunk_batch = []
+                                    chunk_ids = []
+
+                            # Process remaining chunks
+                            if chunk_batch:
+                                sparse_vecs = splade_encoder.encode_batch(chunk_batch, batch_size=SPLADE_BATCH_SIZE)
+                                for cid, sparse_vec in zip(chunk_ids, sparse_vecs):
+                                    batch_postings.append((cid, sparse_vec))
+
+                            # Batch insert all postings
+                            if batch_postings:
+                                splade_index.add_postings_batch(batch_postings)
+
+                                # Set metadata
+                                splade_index.set_metadata(
+                                    model_name=splade_encoder.model_name,
+                                    vocab_size=splade_encoder.vocab_size
+                                )
+
+                                splade_success = True
+                                if progress_callback:
+                                    stats = splade_index.get_stats()
+                                    progress_callback(
+                                        f"SPLADE index created: {stats['total_postings']} postings, "
+                                        f"{stats['unique_tokens']} unique tokens"
+                                    )
+                    else:
+                        logger.debug("SPLADE not available: %s", err)
+                        splade_error = f"SPLADE not available: {err}"
+            except Exception as e:
+                splade_error = str(e)
+                logger.warning("SPLADE encoding failed: %s", e)
+
+            # Report SPLADE status after processing
+            if progress_callback and not splade_success and splade_error:
+                progress_callback(f"SPLADE index: FAILED - {splade_error}")
+
    except Exception as e:
        # Cleanup on error to prevent process hanging
        try:
            _cleanup_fastembed_resources()
+            _cleanup_splade_resources()
            gc.collect()
        except Exception:
            pass
@@ -690,6 +785,7 @@ def generate_embeddings(
    # This is critical - without it, ONNX Runtime threads prevent Python from exiting
    try:
        _cleanup_fastembed_resources()
+        _cleanup_splade_resources()
        gc.collect()
    except Exception:
        pass
@@ -874,6 +970,7 @@ def generate_embeddings_recursive(
    # Each generate_embeddings() call does its own cleanup, but do a final one to be safe
    try:
        _cleanup_fastembed_resources()
+        _cleanup_splade_resources()
        gc.collect()
    except Exception:
        pass