Add comprehensive tests for semantic chunking and search functionality

- Implemented tests for the ChunkConfig and Chunker classes, covering default and custom configurations.
- Added tests for symbol-based chunking, including single and multiple symbols, handling of empty symbols, and preservation of line numbers.
- Developed tests for sliding window chunking, ensuring correct chunking behavior with various content sizes and configurations.
- Created integration tests for semantic search, validating embedding generation, vector storage, and search accuracy across a complex codebase.
- Included performance tests for embedding generation and search operations.
- Established tests for chunking strategies, comparing symbol-based and sliding window approaches.
- Enhanced test coverage for edge cases, including handling of Unicode characters and out-of-bounds symbol ranges.
2026-02-10 02:24:35 +08:00 · 2025-12-12 19:55:35 +08:00
parent c42f91a7fe
commit 4faa5f1c95
27 changed files with 4812 additions and 129 deletions
--- a/codex-lens/src/codexlens/entities.py
+++ b/codex-lens/src/codexlens/entities.py
@@ -67,7 +67,14 @@ class SearchResult(BaseModel):
    path: str = Field(..., min_length=1)
    score: float = Field(..., ge=0.0)
    excerpt: Optional[str] = None
+    content: Optional[str] = Field(default=None, description="Full content of matched code block")
    symbol: Optional[Symbol] = None
    chunk: Optional[SemanticChunk] = None
    metadata: Dict[str, Any] = Field(default_factory=dict)
+    
+    # Additional context for complete code blocks
+    start_line: Optional[int] = Field(default=None, description="Start line of code block (1-based)")
+    end_line: Optional[int] = Field(default=None, description="End line of code block (1-based)")
+    symbol_name: Optional[str] = Field(default=None, description="Name of matched symbol/function/class")
+    symbol_kind: Optional[str] = Field(default=None, description="Kind of symbol (function/class/method)")

--- a/codex-lens/src/codexlens/semantic/init.py
+++ b/codex-lens/src/codexlens/semantic/init.py
@@ -1,28 +1,32 @@
 """Optional semantic search module for CodexLens.

 Install with: pip install codexlens[semantic]
+Uses fastembed (ONNX-based, lightweight ~200MB)
 """

 from __future__ import annotations

 SEMANTIC_AVAILABLE = False
+SEMANTIC_BACKEND: str | None = None
 _import_error: str | None = None

-try:
-    import numpy as np
+def _detect_backend() -> tuple[bool, str | None, str | None]:
+    """Probe for the optional semantic-search dependencies.
+
+    Returns:
+        A ``(available, backend, error)`` tuple: ``(True, "fastembed", None)``
+        when both numpy and fastembed import, otherwise
+        ``(False, None, <reason>)`` naming the missing package.
+    """
+    try:
+        # Imported only to check that numpy is installed; the name is unused.
+        import numpy as np
+    except ImportError as e:
+        return False, None, f"numpy not available: {e}"
+
    try:
        from fastembed import TextEmbedding
-        SEMANTIC_BACKEND = "fastembed"
+        return True, "fastembed", None
    except ImportError:
-        try:
-            from sentence_transformers import SentenceTransformer
-            SEMANTIC_BACKEND = "sentence-transformers"
-        except ImportError:
-            raise ImportError("Neither fastembed nor sentence-transformers available")
-    SEMANTIC_AVAILABLE = True
-except ImportError as e:
-    _import_error = str(e)
-    SEMANTIC_BACKEND = None
+        pass
+
+    return False, None, "fastembed not available. Install with: pip install codexlens[semantic]"
+
+# Initialize on module load
+SEMANTIC_AVAILABLE, SEMANTIC_BACKEND, _import_error = _detect_backend()

 def check_semantic_available() -> tuple[bool, str | None]:
    """Check if semantic search dependencies are available."""
--- a/codex-lens/src/codexlens/semantic/code_extractor.py
+++ b/codex-lens/src/codexlens/semantic/code_extractor.py
@@ -0,0 +1,274 @@
+"""Smart code extraction for complete code blocks."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import List, Optional, Tuple
+
+from codexlens.entities import SearchResult, Symbol
+
+
+def extract_complete_code_block(
+    result: SearchResult,
+    source_file_path: Optional[str] = None,
+    context_lines: int = 0,
+) -> str:
+    """Extract the complete code block for a search result.
+
+    The stored ``result.content`` is returned directly when no extra context
+    is requested. Otherwise the source file is re-read and the block's line
+    range is sliced out, widened by ``context_lines`` on each side.
+
+    Args:
+        result: SearchResult from semantic search.
+        source_file_path: Optional path to the source file for re-reading;
+            defaults to ``result.path``.
+        context_lines: Additional lines of context to include above/below.
+            Negative values are treated as 0.
+
+    Returns:
+        The code block as a string. If the source file cannot be read, the
+        stored content is returned, then the excerpt, then an empty string.
+    """
+    # A negative value would otherwise shrink the block instead of widening it.
+    context_lines = max(0, context_lines)
+
+    # Best-effort answer when the file is unavailable: prefer the stored
+    # block over the excerpt, since the caller asked for the complete code.
+    fallback = result.content or result.excerpt or ""
+
+    if result.content and context_lines == 0:
+        return result.content
+
+    file_path = source_file_path or result.path
+    if not file_path:
+        return fallback
+
+    try:
+        text = Path(file_path).read_text(encoding="utf-8", errors="ignore")
+    except (OSError, ValueError):
+        # Missing/unreadable file or malformed path.
+        return fallback
+
+    lines = text.splitlines()
+
+    # Line numbers are 1-based; a missing bound means "to the file edge".
+    start_line = result.start_line or 1
+    end_line = result.end_line or len(lines)
+
+    start_idx = max(0, start_line - 1 - context_lines)
+    end_idx = min(len(lines), end_line + context_lines)
+
+    return "\n".join(lines[start_idx:end_idx])
+
+
+def extract_symbol_with_context(
+    file_path: str,
+    symbol: Symbol,
+    include_docstring: bool = True,
+    include_decorators: bool = True,
+) -> str:
+    """Extract a symbol (function/class) together with its leading decorators.
+
+    Args:
+        file_path: Path to source file.
+        symbol: Symbol to extract; ``symbol.range`` is a 1-based, inclusive
+            ``(start_line, end_line)`` pair.
+        include_docstring: Currently unused; whatever lies inside the symbol's
+            line range (docstring included) is always returned.
+        include_decorators: Also include decorator lines (``@...``) and
+            ``//`` / ``/* ... */`` style comment lines found directly above
+            the symbol, searching at most 20 lines upwards.
+
+    Returns:
+        The symbol's source text, or an empty string when the file cannot be
+        read or the symbol starts beyond the end of the file.
+    """
+    try:
+        text = Path(file_path).read_text(encoding="utf-8", errors="ignore")
+    except (OSError, ValueError):
+        return ""
+
+    lines = text.splitlines()
+    start_line, end_line = symbol.range
+
+    # Clamp so a non-positive start line cannot turn into a from-the-end slice.
+    start_idx = max(0, start_line - 1)
+    end_idx = end_line
+
+    # A symbol starting past EOF has no code; bail out before the upward scan
+    # so it cannot pick up unrelated trailing lines of the file.
+    if start_idx >= len(lines):
+        return ""
+
+    if include_decorators:
+        lowest = max(0, start_idx - 20)  # look up to 20 lines back
+        block_start = start_idx
+        for i in range(start_idx - 1, lowest - 1, -1):
+            stripped = lines[i].strip()
+            if stripped.startswith(("@", "//", "/*", "*")):
+                # Decorator/annotation or JavaScript/Java style comment:
+                # extend the block upwards to include it.
+                block_start = i
+            elif stripped and not stripped.startswith("#"):
+                # First real code line above the symbol ends the search.
+                break
+            # Blank lines and '#' comments are skipped without extending.
+        start_idx = block_start
+
+    return "\n".join(lines[start_idx:end_idx])
+
+
+def format_search_result_code(
+    result: SearchResult,
+    max_lines: Optional[int] = None,
+    show_line_numbers: bool = True,
+    highlight_match: bool = False,
+) -> str:
+    """Render a search result's code as display text.
+
+    Args:
+        result: SearchResult to format; ``content`` is used when present,
+            otherwise ``excerpt``.
+        max_lines: Maximum lines to show (None or 0 for all).
+        show_line_numbers: Prefix each line with its line number, counting
+            from ``result.start_line`` (or 1 when that is unset).
+        highlight_match: Accepted but currently not used by this function.
+
+    Returns:
+        Formatted code string, ending in a "... (truncated)" marker line
+        when lines were cut; empty string when there is nothing to show.
+    """
+    text = result.content or result.excerpt or ""
+    if not text:
+        return ""
+
+    all_lines = text.splitlines()
+    was_cut = bool(max_lines) and len(all_lines) > max_lines
+    shown = all_lines[:max_lines] if was_cut else all_lines
+
+    if show_line_numbers:
+        first = result.start_line or 1
+        shown = [
+            f"{num:4d} | {line}"
+            for num, line in enumerate(shown, start=first)
+        ]
+
+    rendered = "\n".join(shown)
+    if was_cut:
+        return rendered + "\n... (truncated)"
+    return rendered
+
+
+def get_code_block_summary(result: SearchResult) -> str:
+    """Build a one-line description of a code block.
+
+    Args:
+        result: SearchResult to summarize.
+
+    Returns:
+        Summary string like "function `hello_world` (lines 10-25) in app.py".
+        Without a symbol name, the first 50 characters of the excerpt's first
+        line are quoted instead. Returns "unknown code block" when no field
+        contributes anything.
+    """
+    pieces = []
+
+    if result.symbol_kind:
+        pieces.append(result.symbol_kind)
+
+    if result.symbol_name:
+        pieces.append(f"`{result.symbol_name}`")
+    elif result.excerpt:
+        head, _, _ = result.excerpt.partition("\n")
+        pieces.append(f'"{head[:50]}..."')
+
+    first, last = result.start_line, result.end_line
+    if first and last:
+        span = f"(line {first})" if first == last else f"(lines {first}-{last})"
+        pieces.append(span)
+
+    if result.path:
+        pieces.append(f"in {Path(result.path).name}")
+
+    if not pieces:
+        return "unknown code block"
+    return " ".join(pieces)
+
+
+class CodeBlockResult:
+    """Enhanced search result with lazy access to the complete code block.
+
+    Wraps a SearchResult and exposes its key fields as read-only properties,
+    plus ``full_code`` (extracted on first access and cached), ``summary``
+    and ``format()`` helpers.
+    """
+
+    def __init__(self, result: SearchResult, source_path: Optional[str] = None):
+        """Wrap ``result``; ``source_path`` overrides ``result.path`` for re-reading."""
+        self.result = result
+        self.source_path = source_path or result.path
+        # Cache for full_code; None means "not extracted yet".
+        self._full_code: Optional[str] = None
+
+    @property
+    def score(self) -> float:
+        """Similarity score of the wrapped result."""
+        return self.result.score
+
+    @property
+    def path(self) -> str:
+        """Path of the file the result came from."""
+        return self.result.path
+
+    @property
+    def file_name(self) -> str:
+        """Final path component of ``path``."""
+        return Path(self.result.path).name
+
+    @property
+    def symbol_name(self) -> Optional[str]:
+        """Name of the matched symbol, if known."""
+        return self.result.symbol_name
+
+    @property
+    def symbol_kind(self) -> Optional[str]:
+        """Kind of the matched symbol, if known."""
+        return self.result.symbol_kind
+
+    @property
+    def line_range(self) -> Tuple[int, int]:
+        """Return ``(start_line, end_line)`` of the block (1-based).
+
+        A missing start defaults to 1; a missing end defaults to the start so
+        the range is never inverted when only the start line is known.
+        """
+        start = self.result.start_line or 1
+        end = self.result.end_line or start
+        return (start, end)
+
+    @property
+    def full_code(self) -> str:
+        """Get full code block content (extracted once, then cached)."""
+        if self._full_code is None:
+            self._full_code = extract_complete_code_block(self.result, self.source_path)
+        return self._full_code
+
+    @property
+    def excerpt(self) -> str:
+        """Get short excerpt, or an empty string when there is none."""
+        return self.result.excerpt or ""
+
+    @property
+    def summary(self) -> str:
+        """Get code block summary."""
+        return get_code_block_summary(self.result)
+
+    def format(
+        self,
+        max_lines: Optional[int] = None,
+        show_line_numbers: bool = True,
+    ) -> str:
+        """Format the full code block for display.
+
+        Args:
+            max_lines: Maximum lines to show (None for all).
+            show_line_numbers: Include line numbers in output.
+
+        Returns:
+            Formatted code string.
+        """
+        # Build a temporary result carrying the full code so the shared
+        # formatter renders the whole block rather than the stored excerpt.
+        display_result = SearchResult(
+            path=self.result.path,
+            score=self.result.score,
+            content=self.full_code,
+            start_line=self.result.start_line,
+            end_line=self.result.end_line,
+        )
+        return format_search_result_code(
+            display_result,
+            max_lines=max_lines,
+            show_line_numbers=show_line_numbers,
+        )
+
+    def __repr__(self) -> str:
+        return f"<CodeBlockResult {self.summary} score={self.score:.3f}>"
+
+
+def enhance_search_results(
+    results: List[SearchResult],
+) -> List[CodeBlockResult]:
+    """Wrap each search result in a CodeBlockResult.
+
+    Args:
+        results: List of SearchResult from semantic search.
+
+    Returns:
+        One CodeBlockResult per input result, in the same order.
+    """
+    return list(map(CodeBlockResult, results))
--- a/codex-lens/src/codexlens/semantic/embedder.py
+++ b/codex-lens/src/codexlens/semantic/embedder.py
@@ -1,17 +1,14 @@
-"""Embedder for semantic code search."""
+"""Embedder for semantic code search using fastembed."""

 from __future__ import annotations

 from typing import Iterable, List

-from . import SEMANTIC_AVAILABLE, SEMANTIC_BACKEND
-
-if SEMANTIC_AVAILABLE:
-    import numpy as np
+from . import SEMANTIC_AVAILABLE


 class Embedder:
-    """Generate embeddings for code chunks using fastembed or sentence-transformers."""
+    """Generate embeddings for code chunks using fastembed (ONNX-based)."""

    MODEL_NAME = "BAAI/bge-small-en-v1.5"
    EMBEDDING_DIM = 384
@@ -25,19 +22,14 @@ class Embedder:

        self.model_name = model_name or self.MODEL_NAME
        self._model = None
-        self._backend = SEMANTIC_BACKEND

    def _load_model(self) -> None:
        """Lazy load the embedding model."""
        if self._model is not None:
            return

-        if self._backend == "fastembed":
-            from fastembed import TextEmbedding
-            self._model = TextEmbedding(model_name=self.model_name)
-        else:
-            from sentence_transformers import SentenceTransformer
-            self._model = SentenceTransformer(self.model_name)
+        from fastembed import TextEmbedding
+        self._model = TextEmbedding(model_name=self.model_name)

    def embed(self, texts: str | Iterable[str]) -> List[List[float]]:
        """Generate embeddings for one or more texts.
@@ -55,12 +47,8 @@ class Embedder:
        else:
            texts = list(texts)

-        if self._backend == "fastembed":
-            embeddings = list(self._model.embed(texts))
-            return [emb.tolist() for emb in embeddings]
-        else:
-            embeddings = self._model.encode(texts)
-            return embeddings.tolist()
+        embeddings = list(self._model.embed(texts))
+        return [emb.tolist() for emb in embeddings]

    def embed_single(self, text: str) -> List[float]:
        """Generate embedding for a single text."""
--- a/codex-lens/src/codexlens/semantic/vector_store.py
+++ b/codex-lens/src/codexlens/semantic/vector_store.py
@@ -119,6 +119,7 @@ class VectorStore:
        query_embedding: List[float],
        top_k: int = 10,
        min_score: float = 0.0,
+        return_full_content: bool = True,
    ) -> List[SearchResult]:
        """Find chunks most similar to query embedding.

@@ -126,6 +127,7 @@ class VectorStore:
            query_embedding: Query vector.
            top_k: Maximum results to return.
            min_score: Minimum similarity score (0-1).
+            return_full_content: If True, return full code block content.

        Returns:
            List of SearchResult ordered by similarity (highest first).
@@ -144,14 +146,39 @@ class VectorStore:
            if score >= min_score:
                metadata = json.loads(metadata_json) if metadata_json else {}

-                # Build excerpt
+                # Build excerpt (short preview)
                excerpt = content[:200] + "..." if len(content) > 200 else content
+                
+                # Extract symbol information from metadata
+                symbol_name = metadata.get("symbol_name")
+                symbol_kind = metadata.get("symbol_kind")
+                start_line = metadata.get("start_line")
+                end_line = metadata.get("end_line")
+                
+                # Build Symbol object if we have symbol info
+                symbol = None
+                if symbol_name and symbol_kind and start_line and end_line:
+                    try:
+                        from codexlens.entities import Symbol
+                        symbol = Symbol(
+                            name=symbol_name,
+                            kind=symbol_kind,
+                            range=(start_line, end_line)
+                        )
+                    except Exception:
+                        pass

                results.append((score, SearchResult(
                    path=file_path,
                    score=score,
                    excerpt=excerpt,
-                    symbol=None,
+                    content=content if return_full_content else None,
+                    symbol=symbol,
+                    metadata=metadata,
+                    start_line=start_line,
+                    end_line=end_line,
+                    symbol_name=symbol_name,
+                    symbol_kind=symbol_kind,
                )))

        # Sort by score descending