feat(codexlens): add CodexLens code indexing platform with incremental updates

- Add CodexLens Python package with SQLite FTS5 search and tree-sitter parsing
- Implement workspace-local index storage (.codexlens/ directory)
- Add incremental update CLI command for efficient file-level index refresh
- Integrate CodexLens with CCW tools (codex_lens action: update)
- Add CodexLens Auto-Sync hook template for automatic index updates on file changes
- Add CodexLens status card in CCW Dashboard CLI Manager with install/init buttons
- Add server APIs: /api/codexlens/status, /api/codexlens/bootstrap, /api/codexlens/init

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
catlog22 committed 2025-12-12 15:02:32 +08:00
commit a393601ec5 (parent b74a90b416)
31 changed files with 2718 additions and 27 deletions
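
Of the server APIs listed in the commit message above, the status endpoint is the natural smoke test. A minimal sketch in Python; the host and port are assumptions, since the diff does not show where the CCW Dashboard listens, and the response shape is not asserted:

import json
from urllib.request import urlopen

# Host/port illustrative only; adjust to the local CCW Dashboard address.
with urlopen("http://localhost:3000/api/codexlens/status") as resp:
    print(json.loads(resp.read()))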

View File

@@ -0,0 +1,31 @@
"""Optional semantic search module for CodexLens.
Install with: pip install codexlens[semantic]
"""
from __future__ import annotations
SEMANTIC_AVAILABLE = False
_import_error: str | None = None
try:
import numpy as np
try:
from fastembed import TextEmbedding
SEMANTIC_BACKEND = "fastembed"
except ImportError:
try:
from sentence_transformers import SentenceTransformer
SEMANTIC_BACKEND = "sentence-transformers"
except ImportError:
raise ImportError("Neither fastembed nor sentence-transformers available")
SEMANTIC_AVAILABLE = True
except ImportError as e:
_import_error = str(e)
SEMANTIC_BACKEND = None
def check_semantic_available() -> tuple[bool, str | None]:
"""Check if semantic search dependencies are available."""
return SEMANTIC_AVAILABLE, _import_error
__all__ = ["SEMANTIC_AVAILABLE", "SEMANTIC_BACKEND", "check_semantic_available"]
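
A sketch of how a caller might gate features on this probe. The import path assumes the file above is the `__init__` of a `codexlens.semantic` subpackage, which the relative `from . import SEMANTIC_AVAILABLE` imports in the later files suggest but the diff does not confirm:

from codexlens.semantic import check_semantic_available

available, error = check_semantic_available()
if not available:
    print(f"Semantic search disabled: {error}")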

View File

@@ -0,0 +1,130 @@
"""Code chunking strategies for semantic search."""
from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional
from codexlens.entities import SemanticChunk, Symbol
@dataclass
class ChunkConfig:
"""Configuration for chunking strategies."""
max_chunk_size: int = 1000 # Max characters per chunk
overlap: int = 100 # Overlap for sliding window
min_chunk_size: int = 50 # Minimum chunk size
class Chunker:
"""Chunk code files for semantic embedding."""
def __init__(self, config: ChunkConfig | None = None) -> None:
self.config = config or ChunkConfig()
def chunk_by_symbol(
self,
content: str,
symbols: List[Symbol],
file_path: str | Path,
language: str,
) -> List[SemanticChunk]:
"""Chunk code by extracted symbols (functions, classes).
Each symbol becomes one chunk with its full content.
"""
chunks: List[SemanticChunk] = []
lines = content.splitlines(keepends=True)
for symbol in symbols:
start_line, end_line = symbol.range
# Convert to 0-indexed
start_idx = max(0, start_line - 1)
end_idx = min(len(lines), end_line)
chunk_content = "".join(lines[start_idx:end_idx])
if len(chunk_content.strip()) < self.config.min_chunk_size:
continue
chunks.append(SemanticChunk(
content=chunk_content,
embedding=None,
metadata={
"file": str(file_path),
"language": language,
"symbol_name": symbol.name,
"symbol_kind": symbol.kind,
"start_line": start_line,
"end_line": end_line,
"strategy": "symbol",
}
))
return chunks
def chunk_sliding_window(
self,
content: str,
file_path: str | Path,
language: str,
) -> List[SemanticChunk]:
"""Chunk code using sliding window approach.
Used for files without clear symbol boundaries or very long functions.
"""
chunks: List[SemanticChunk] = []
lines = content.splitlines(keepends=True)
if not lines:
return chunks
# Calculate lines per chunk based on average line length
avg_line_len = len(content) / max(len(lines), 1)
lines_per_chunk = max(10, int(self.config.max_chunk_size / max(avg_line_len, 1)))
overlap_lines = max(2, int(self.config.overlap / max(avg_line_len, 1)))
start = 0
chunk_idx = 0
while start < len(lines):
end = min(start + lines_per_chunk, len(lines))
chunk_content = "".join(lines[start:end])
if len(chunk_content.strip()) >= self.config.min_chunk_size:
chunks.append(SemanticChunk(
content=chunk_content,
embedding=None,
metadata={
"file": str(file_path),
"language": language,
"chunk_index": chunk_idx,
"start_line": start + 1,
"end_line": end,
"strategy": "sliding_window",
}
))
chunk_idx += 1
# Move window, accounting for overlap
start = end - overlap_lines
if start >= len(lines) - overlap_lines:
break
return chunks
def chunk_file(
self,
content: str,
symbols: List[Symbol],
file_path: str | Path,
language: str,
) -> List[SemanticChunk]:
"""Chunk a file using the best strategy.
Uses symbol-based chunking if symbols available,
falls back to sliding window for files without symbols.
"""
if symbols:
return self.chunk_by_symbol(content, symbols, file_path, language)
return self.chunk_sliding_window(content, file_path, language)
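
A quick usage sketch of the classes above; `example.py` and the config values are illustrative. Passing an empty symbols list exercises the sliding-window fallback:

from pathlib import Path

chunker = Chunker(ChunkConfig(max_chunk_size=600, overlap=60))
content = Path("example.py").read_text(encoding="utf-8")
chunks = chunker.chunk_file(content, symbols=[], file_path="example.py", language="python")
for chunk in chunks:
    meta = chunk.metadata
    print(meta["strategy"], meta["start_line"], meta["end_line"])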

View File

@@ -0,0 +1,67 @@
"""Embedder for semantic code search."""
from __future__ import annotations
from typing import Iterable, List
from . import SEMANTIC_AVAILABLE, SEMANTIC_BACKEND
if SEMANTIC_AVAILABLE:
import numpy as np
class Embedder:
"""Generate embeddings for code chunks using fastembed or sentence-transformers."""
MODEL_NAME = "BAAI/bge-small-en-v1.5"
EMBEDDING_DIM = 384
def __init__(self, model_name: str | None = None) -> None:
if not SEMANTIC_AVAILABLE:
raise ImportError(
"Semantic search dependencies not available. "
"Install with: pip install codexlens[semantic]"
)
self.model_name = model_name or self.MODEL_NAME
self._model = None
self._backend = SEMANTIC_BACKEND
def _load_model(self) -> None:
"""Lazy load the embedding model."""
if self._model is not None:
return
if self._backend == "fastembed":
from fastembed import TextEmbedding
self._model = TextEmbedding(model_name=self.model_name)
else:
from sentence_transformers import SentenceTransformer
self._model = SentenceTransformer(self.model_name)
def embed(self, texts: str | Iterable[str]) -> List[List[float]]:
"""Generate embeddings for one or more texts.
Args:
texts: Single text or iterable of texts to embed.
Returns:
List of embedding vectors (each is a list of floats).
"""
self._load_model()
if isinstance(texts, str):
texts = [texts]
else:
texts = list(texts)
if self._backend == "fastembed":
embeddings = list(self._model.embed(texts))
return [emb.tolist() for emb in embeddings]
else:
embeddings = self._model.encode(texts)
return embeddings.tolist()
def embed_single(self, text: str) -> List[float]:
"""Generate embedding for a single text."""
return self.embed(text)[0]
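
A short sketch of the embedder in use. The model loads lazily on the first `embed` call, not at construction, and with the default model `EMBEDDING_DIM` matches bge-small-en-v1.5's 384 dimensions; the sample strings are illustrative:

embedder = Embedder()  # raises ImportError unless codexlens[semantic] extras are installed
vectors = embedder.embed(["def add(a, b): return a + b", "class Greeter: ..."])
assert len(vectors) == 2 and len(vectors[0]) == Embedder.EMBEDDING_DIM
query = embedder.embed_single("function that adds two numbers")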

View File

@@ -0,0 +1,166 @@
"""Vector storage and similarity search for semantic chunks."""
from __future__ import annotations
import json
import sqlite3
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
from codexlens.entities import SearchResult, SemanticChunk
from codexlens.errors import StorageError
from . import SEMANTIC_AVAILABLE
if SEMANTIC_AVAILABLE:
import numpy as np
def _cosine_similarity(a: List[float], b: List[float]) -> float:
"""Compute cosine similarity between two vectors."""
if not SEMANTIC_AVAILABLE:
raise ImportError("numpy required for vector operations")
a_arr = np.array(a)
b_arr = np.array(b)
norm_a = np.linalg.norm(a_arr)
norm_b = np.linalg.norm(b_arr)
if norm_a == 0 or norm_b == 0:
return 0.0
return float(np.dot(a_arr, b_arr) / (norm_a * norm_b))
class VectorStore:
"""SQLite-based vector storage with cosine similarity search."""
def __init__(self, db_path: str | Path) -> None:
if not SEMANTIC_AVAILABLE:
raise ImportError(
"Semantic search dependencies not available. "
"Install with: pip install codexlens[semantic]"
)
self.db_path = Path(db_path)
self.db_path.parent.mkdir(parents=True, exist_ok=True)
self._init_schema()
def _init_schema(self) -> None:
"""Initialize vector storage schema."""
with sqlite3.connect(self.db_path) as conn:
conn.execute("""
CREATE TABLE IF NOT EXISTS semantic_chunks (
id INTEGER PRIMARY KEY AUTOINCREMENT,
file_path TEXT NOT NULL,
content TEXT NOT NULL,
embedding BLOB NOT NULL,
metadata TEXT,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
""")
conn.execute("""
CREATE INDEX IF NOT EXISTS idx_chunks_file
ON semantic_chunks(file_path)
""")
conn.commit()
def add_chunk(self, chunk: SemanticChunk, file_path: str) -> int:
"""Add a single chunk with its embedding.
Returns:
The inserted chunk ID.
"""
if chunk.embedding is None:
raise ValueError("Chunk must have embedding before adding to store")
embedding_blob = np.array(chunk.embedding, dtype=np.float32).tobytes()
metadata_json = json.dumps(chunk.metadata) if chunk.metadata else None
with sqlite3.connect(self.db_path) as conn:
cursor = conn.execute(
"""
INSERT INTO semantic_chunks (file_path, content, embedding, metadata)
VALUES (?, ?, ?, ?)
""",
(file_path, chunk.content, embedding_blob, metadata_json)
)
conn.commit()
return cursor.lastrowid or 0
def add_chunks(self, chunks: List[SemanticChunk], file_path: str) -> List[int]:
"""Add multiple chunks with embeddings.
Returns:
List of inserted chunk IDs.
"""
ids = []
for chunk in chunks:
ids.append(self.add_chunk(chunk, file_path))
return ids
def delete_file_chunks(self, file_path: str) -> int:
"""Delete all chunks for a file.
Returns:
Number of deleted chunks.
"""
with sqlite3.connect(self.db_path) as conn:
cursor = conn.execute(
"DELETE FROM semantic_chunks WHERE file_path = ?",
(file_path,)
)
conn.commit()
return cursor.rowcount
def search_similar(
self,
query_embedding: List[float],
top_k: int = 10,
min_score: float = 0.0,
) -> List[SearchResult]:
"""Find chunks most similar to query embedding.
Args:
query_embedding: Query vector.
top_k: Maximum results to return.
min_score: Minimum similarity score (0-1).
Returns:
List of SearchResult ordered by similarity (highest first).
"""
results: List[Tuple[float, SearchResult]] = []
with sqlite3.connect(self.db_path) as conn:
rows = conn.execute(
"SELECT id, file_path, content, embedding, metadata FROM semantic_chunks"
).fetchall()
for row_id, file_path, content, embedding_blob, metadata_json in rows:
stored_embedding = np.frombuffer(embedding_blob, dtype=np.float32).tolist()
score = _cosine_similarity(query_embedding, stored_embedding)
if score >= min_score:
metadata = json.loads(metadata_json) if metadata_json else {}
# Build excerpt
excerpt = content[:200] + "..." if len(content) > 200 else content
results.append((score, SearchResult(
path=file_path,
score=score,
excerpt=excerpt,
symbol=None,
)))
# Sort by score descending
results.sort(key=lambda x: x[0], reverse=True)
return [r for _, r in results[:top_k]]
def count_chunks(self) -> int:
"""Count total chunks in store."""
with sqlite3.connect(self.db_path) as conn:
row = conn.execute("SELECT COUNT(*) FROM semantic_chunks").fetchone()
return row[0] if row else 0
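
Taken together, the chunker, embedder, and store compose into an index-then-query pipeline. A hedged end-to-end sketch using the classes defined in the files above: the `.codexlens/` directory comes from the commit message, but the `vectors.db` filename, the sample source file, and the assumption that `SemanticChunk.embedding` is assignable are all illustrative:

from pathlib import Path

source = Path("src/utils.py")
content = source.read_text(encoding="utf-8")

chunker = Chunker()
embedder = Embedder()
store = VectorStore(".codexlens/vectors.db")

# Index: chunk (empty symbols -> sliding window), embed in batch, store
chunks = chunker.chunk_file(content, symbols=[], file_path=source, language="python")
vectors = embedder.embed([c.content for c in chunks])
for chunk, vec in zip(chunks, vectors):
    chunk.embedding = vec  # assumes SemanticChunk allows assigning the embedding
store.delete_file_chunks(str(source))  # replace any stale chunks for this file
store.add_chunks(chunks, str(source))

# Query: embed the question, rank stored chunks by cosine similarity
hits = store.search_similar(embedder.embed_single("parse config file"), top_k=5)
for hit in hits:
    print(f"{hit.score:.3f} {hit.path}")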