Mirror of https://github.com/catlog22/Claude-Code-Workflow.git, synced 2026-02-10 02:24:35 +08:00
feat(codexlens): add CodexLens code indexing platform with incremental updates
- Add CodexLens Python package with SQLite FTS5 search and tree-sitter parsing
- Implement workspace-local index storage (.codexlens/ directory)
- Add incremental update CLI command for efficient file-level index refresh
- Integrate CodexLens with CCW tools (codex_lens action: update)
- Add CodexLens Auto-Sync hook template for automatic index updates on file changes
- Add CodexLens status card in CCW Dashboard CLI Manager with install/init buttons
- Add server APIs: /api/codexlens/status, /api/codexlens/bootstrap, /api/codexlens/init

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
codex-lens/src/codexlens/semantic/__init__.py (new file, 31 lines)
@@ -0,0 +1,31 @@
"""Optional semantic search module for CodexLens.

Install with: pip install codexlens[semantic]
"""

from __future__ import annotations

SEMANTIC_AVAILABLE = False
_import_error: str | None = None

try:
    import numpy as np

    try:
        from fastembed import TextEmbedding

        SEMANTIC_BACKEND = "fastembed"
    except ImportError:
        try:
            from sentence_transformers import SentenceTransformer

            SEMANTIC_BACKEND = "sentence-transformers"
        except ImportError:
            raise ImportError("Neither fastembed nor sentence-transformers available")

    SEMANTIC_AVAILABLE = True
except ImportError as e:
    _import_error = str(e)
    SEMANTIC_BACKEND = None


def check_semantic_available() -> tuple[bool, str | None]:
    """Check if semantic search dependencies are available."""
    return SEMANTIC_AVAILABLE, _import_error


__all__ = ["SEMANTIC_AVAILABLE", "SEMANTIC_BACKEND", "check_semantic_available"]
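As a quick illustration of how callers can guard on these module-level flags, here is a minimal sketch (not part of the diff; the print-based fallback is illustrative):

from codexlens.semantic import SEMANTIC_BACKEND, check_semantic_available

available, error = check_semantic_available()
if available:
    print(f"Semantic search enabled via {SEMANTIC_BACKEND}")
else:
    # Fall back to FTS5-only search and surface the install hint.
    print(f"Semantic search disabled: {error}")
    print("Install with: pip install codexlens[semantic]")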
codex-lens/src/codexlens/semantic/chunker.py (new file, 130 lines)
@@ -0,0 +1,130 @@
"""Code chunking strategies for semantic search."""

from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional

from codexlens.entities import SemanticChunk, Symbol


@dataclass
class ChunkConfig:
    """Configuration for chunking strategies."""
    max_chunk_size: int = 1000  # Max characters per chunk
    overlap: int = 100  # Overlap for sliding window
    min_chunk_size: int = 50  # Minimum chunk size


class Chunker:
    """Chunk code files for semantic embedding."""

    def __init__(self, config: ChunkConfig | None = None) -> None:
        self.config = config or ChunkConfig()

    def chunk_by_symbol(
        self,
        content: str,
        symbols: List[Symbol],
        file_path: str | Path,
        language: str,
    ) -> List[SemanticChunk]:
        """Chunk code by extracted symbols (functions, classes).

        Each symbol becomes one chunk with its full content.
        """
        chunks: List[SemanticChunk] = []
        lines = content.splitlines(keepends=True)

        for symbol in symbols:
            start_line, end_line = symbol.range
            # Convert to 0-indexed
            start_idx = max(0, start_line - 1)
            end_idx = min(len(lines), end_line)

            chunk_content = "".join(lines[start_idx:end_idx])
            if len(chunk_content.strip()) < self.config.min_chunk_size:
                continue

            chunks.append(SemanticChunk(
                content=chunk_content,
                embedding=None,
                metadata={
                    "file": str(file_path),
                    "language": language,
                    "symbol_name": symbol.name,
                    "symbol_kind": symbol.kind,
                    "start_line": start_line,
                    "end_line": end_line,
                    "strategy": "symbol",
                }
            ))

        return chunks

    def chunk_sliding_window(
        self,
        content: str,
        file_path: str | Path,
        language: str,
    ) -> List[SemanticChunk]:
        """Chunk code using a sliding-window approach.

        Used for files without clear symbol boundaries or for very long functions.
        """
        chunks: List[SemanticChunk] = []
        lines = content.splitlines(keepends=True)

        if not lines:
            return chunks

        # Calculate lines per chunk based on average line length
        avg_line_len = len(content) / max(len(lines), 1)
        lines_per_chunk = max(10, int(self.config.max_chunk_size / max(avg_line_len, 1)))
        overlap_lines = max(2, int(self.config.overlap / max(avg_line_len, 1)))

        start = 0
        chunk_idx = 0

        while start < len(lines):
            end = min(start + lines_per_chunk, len(lines))
            chunk_content = "".join(lines[start:end])

            if len(chunk_content.strip()) >= self.config.min_chunk_size:
                chunks.append(SemanticChunk(
                    content=chunk_content,
                    embedding=None,
                    metadata={
                        "file": str(file_path),
                        "language": language,
                        "chunk_index": chunk_idx,
                        "start_line": start + 1,
                        "end_line": end,
                        "strategy": "sliding_window",
                    }
                ))
                chunk_idx += 1

            # Move window, accounting for overlap
            start = end - overlap_lines
            if start >= len(lines) - overlap_lines:
                break

        return chunks

    def chunk_file(
        self,
        content: str,
        symbols: List[Symbol],
        file_path: str | Path,
        language: str,
    ) -> List[SemanticChunk]:
        """Chunk a file using the best strategy.

        Uses symbol-based chunking if symbols are available,
        and falls back to a sliding window for files without symbols.
        """
        if symbols:
            return self.chunk_by_symbol(content, symbols, file_path, language)
        return self.chunk_sliding_window(content, file_path, language)
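A short usage sketch for the chunker. Symbol-based chunking needs Symbol objects from the tree-sitter parser, so for brevity this only exercises the sliding-window path on a synthetic string; the file path and language tag are made up for the example:

from codexlens.semantic.chunker import ChunkConfig, Chunker

source = "def add(a, b):\n    return a + b\n" * 50  # synthetic content
chunker = Chunker(ChunkConfig(max_chunk_size=500, overlap=50))

chunks = chunker.chunk_sliding_window(source, file_path="example.py", language="python")
for chunk in chunks[:3]:
    meta = chunk.metadata
    print(meta["chunk_index"], meta["start_line"], meta["end_line"], len(chunk.content))

In the real pipeline, chunk_file would pick the symbol-based strategy automatically once the parser supplies Symbol objects for the file.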
codex-lens/src/codexlens/semantic/embedder.py (new file, 67 lines)
@@ -0,0 +1,67 @@
"""Embedder for semantic code search."""

from __future__ import annotations

from typing import Iterable, List

from . import SEMANTIC_AVAILABLE, SEMANTIC_BACKEND

if SEMANTIC_AVAILABLE:
    import numpy as np


class Embedder:
    """Generate embeddings for code chunks using fastembed or sentence-transformers."""

    MODEL_NAME = "BAAI/bge-small-en-v1.5"
    EMBEDDING_DIM = 384

    def __init__(self, model_name: str | None = None) -> None:
        if not SEMANTIC_AVAILABLE:
            raise ImportError(
                "Semantic search dependencies not available. "
                "Install with: pip install codexlens[semantic]"
            )

        self.model_name = model_name or self.MODEL_NAME
        self._model = None
        self._backend = SEMANTIC_BACKEND

    def _load_model(self) -> None:
        """Lazy load the embedding model."""
        if self._model is not None:
            return

        if self._backend == "fastembed":
            from fastembed import TextEmbedding
            self._model = TextEmbedding(model_name=self.model_name)
        else:
            from sentence_transformers import SentenceTransformer
            self._model = SentenceTransformer(self.model_name)

    def embed(self, texts: str | Iterable[str]) -> List[List[float]]:
        """Generate embeddings for one or more texts.

        Args:
            texts: Single text or iterable of texts to embed.

        Returns:
            List of embedding vectors (each is a list of floats).
        """
        self._load_model()

        if isinstance(texts, str):
            texts = [texts]
        else:
            texts = list(texts)

        if self._backend == "fastembed":
            embeddings = list(self._model.embed(texts))
            return [emb.tolist() for emb in embeddings]
        else:
            embeddings = self._model.encode(texts)
            return embeddings.tolist()

    def embed_single(self, text: str) -> List[float]:
        """Generate embedding for a single text."""
        return self.embed(text)[0]
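A minimal sketch of calling the embedder, assuming the semantic extra is installed so one of the two backends can load (the model is fetched on first use):

from codexlens.semantic.embedder import Embedder

embedder = Embedder()  # defaults to BAAI/bge-small-en-v1.5
vectors = embedder.embed(["def add(a, b): return a + b", "class Foo: pass"])
print(len(vectors), len(vectors[0]))  # 2 vectors, 384 dimensions each

query_vec = embedder.embed_single("function that adds two numbers")

Because the model is loaded lazily, constructing Embedder is cheap; the first embed call is the one that pays the model download and load cost.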
codex-lens/src/codexlens/semantic/vector_store.py (new file, 166 lines)
@@ -0,0 +1,166 @@
"""Vector storage and similarity search for semantic chunks."""

from __future__ import annotations

import json
import sqlite3
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

from codexlens.entities import SearchResult, SemanticChunk
from codexlens.errors import StorageError

from . import SEMANTIC_AVAILABLE

if SEMANTIC_AVAILABLE:
    import numpy as np


def _cosine_similarity(a: List[float], b: List[float]) -> float:
    """Compute cosine similarity between two vectors."""
    if not SEMANTIC_AVAILABLE:
        raise ImportError("numpy required for vector operations")

    a_arr = np.array(a)
    b_arr = np.array(b)

    norm_a = np.linalg.norm(a_arr)
    norm_b = np.linalg.norm(b_arr)

    if norm_a == 0 or norm_b == 0:
        return 0.0

    return float(np.dot(a_arr, b_arr) / (norm_a * norm_b))


class VectorStore:
    """SQLite-based vector storage with cosine similarity search."""

    def __init__(self, db_path: str | Path) -> None:
        if not SEMANTIC_AVAILABLE:
            raise ImportError(
                "Semantic search dependencies not available. "
                "Install with: pip install codexlens[semantic]"
            )

        self.db_path = Path(db_path)
        self.db_path.parent.mkdir(parents=True, exist_ok=True)
        self._init_schema()

    def _init_schema(self) -> None:
        """Initialize vector storage schema."""
        with sqlite3.connect(self.db_path) as conn:
            conn.execute("""
                CREATE TABLE IF NOT EXISTS semantic_chunks (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    file_path TEXT NOT NULL,
                    content TEXT NOT NULL,
                    embedding BLOB NOT NULL,
                    metadata TEXT,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            """)
            conn.execute("""
                CREATE INDEX IF NOT EXISTS idx_chunks_file
                ON semantic_chunks(file_path)
            """)
            conn.commit()

    def add_chunk(self, chunk: SemanticChunk, file_path: str) -> int:
        """Add a single chunk with its embedding.

        Returns:
            The inserted chunk ID.
        """
        if chunk.embedding is None:
            raise ValueError("Chunk must have embedding before adding to store")

        embedding_blob = np.array(chunk.embedding, dtype=np.float32).tobytes()
        metadata_json = json.dumps(chunk.metadata) if chunk.metadata else None

        with sqlite3.connect(self.db_path) as conn:
            cursor = conn.execute(
                """
                INSERT INTO semantic_chunks (file_path, content, embedding, metadata)
                VALUES (?, ?, ?, ?)
                """,
                (file_path, chunk.content, embedding_blob, metadata_json)
            )
            conn.commit()
            return cursor.lastrowid or 0

    def add_chunks(self, chunks: List[SemanticChunk], file_path: str) -> List[int]:
        """Add multiple chunks with embeddings.

        Returns:
            List of inserted chunk IDs.
        """
        ids = []
        for chunk in chunks:
            ids.append(self.add_chunk(chunk, file_path))
        return ids

    def delete_file_chunks(self, file_path: str) -> int:
        """Delete all chunks for a file.

        Returns:
            Number of deleted chunks.
        """
        with sqlite3.connect(self.db_path) as conn:
            cursor = conn.execute(
                "DELETE FROM semantic_chunks WHERE file_path = ?",
                (file_path,)
            )
            conn.commit()
            return cursor.rowcount

    def search_similar(
        self,
        query_embedding: List[float],
        top_k: int = 10,
        min_score: float = 0.0,
    ) -> List[SearchResult]:
        """Find chunks most similar to query embedding.

        Args:
            query_embedding: Query vector.
            top_k: Maximum results to return.
            min_score: Minimum similarity score (0-1).

        Returns:
            List of SearchResult ordered by similarity (highest first).
        """
        results: List[Tuple[float, SearchResult]] = []

        with sqlite3.connect(self.db_path) as conn:
            rows = conn.execute(
                "SELECT id, file_path, content, embedding, metadata FROM semantic_chunks"
            ).fetchall()

            for row_id, file_path, content, embedding_blob, metadata_json in rows:
                stored_embedding = np.frombuffer(embedding_blob, dtype=np.float32).tolist()
                score = _cosine_similarity(query_embedding, stored_embedding)

                if score >= min_score:
                    metadata = json.loads(metadata_json) if metadata_json else {}

                    # Build excerpt
                    excerpt = content[:200] + "..." if len(content) > 200 else content

                    results.append((score, SearchResult(
                        path=file_path,
                        score=score,
                        excerpt=excerpt,
                        symbol=None,
                    )))

        # Sort by score descending
        results.sort(key=lambda x: x[0], reverse=True)

        return [r for _, r in results[:top_k]]

    def count_chunks(self) -> int:
        """Count total chunks in store."""
        with sqlite3.connect(self.db_path) as conn:
            row = conn.execute("SELECT COUNT(*) FROM semantic_chunks").fetchone()
            return row[0] if row else 0
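Putting the three pieces together, a hedged end-to-end sketch: chunk a file, embed each chunk, persist it, then run a similarity query. It assumes the semantic extra is installed, that SemanticChunk.embedding is an assignable field (which is how add_chunk consumes it above), and that the .codexlens/vectors.db location and example.py path are illustrative choices, not names from this commit:

from pathlib import Path

from codexlens.semantic.chunker import Chunker
from codexlens.semantic.embedder import Embedder
from codexlens.semantic.vector_store import VectorStore

path = Path("example.py")                      # any source file to index
content = path.read_text(encoding="utf-8")

chunker = Chunker()
embedder = Embedder()
store = VectorStore(".codexlens/vectors.db")   # db location is illustrative

chunks = chunker.chunk_sliding_window(content, file_path=path, language="python")
for chunk in chunks:
    chunk.embedding = embedder.embed_single(chunk.content)

store.delete_file_chunks(str(path))            # drop any stale chunks for this file
store.add_chunks(chunks, str(path))

query_vec = embedder.embed_single("where is the add function defined?")
for result in store.search_similar(query_vec, top_k=5, min_score=0.2):
    print(f"{result.score:.3f}  {result.path}")

Note that search_similar scores every stored vector in Python with a full-table scan, which is fine for workspace-sized indexes but worth keeping in mind for very large repositories.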