Add comprehensive tests for semantic chunking and search functionality

- Implemented tests for the ChunkConfig and Chunker classes, covering default and custom configurations.
- Added tests for symbol-based chunking, including single and multiple symbols, handling of empty symbols, and preservation of line numbers.
- Developed tests for sliding window chunking, ensuring correct chunking behavior with various content sizes and configurations.
- Created integration tests for semantic search, validating embedding generation, vector storage, and search accuracy across a complex codebase.
- Included performance tests for embedding generation and search operations.
- Established tests for chunking strategies, comparing symbol-based and sliding window approaches.
- Enhanced test coverage for edge cases, including handling of Unicode characters and out-of-bounds symbol ranges.
This commit is contained in:
catlog22
2025-12-12 19:55:35 +08:00
parent c42f91a7fe
commit 4faa5f1c95
27 changed files with 4812 additions and 129 deletions

View File

@@ -67,7 +67,14 @@ class SearchResult(BaseModel):
path: str = Field(..., min_length=1)
score: float = Field(..., ge=0.0)
excerpt: Optional[str] = None
content: Optional[str] = Field(default=None, description="Full content of matched code block")
symbol: Optional[Symbol] = None
chunk: Optional[SemanticChunk] = None
metadata: Dict[str, Any] = Field(default_factory=dict)
# Additional context for complete code blocks
start_line: Optional[int] = Field(default=None, description="Start line of code block (1-based)")
end_line: Optional[int] = Field(default=None, description="End line of code block (1-based)")
symbol_name: Optional[str] = Field(default=None, description="Name of matched symbol/function/class")
symbol_kind: Optional[str] = Field(default=None, description="Kind of symbol (function/class/method)")

View File

@@ -1,28 +1,32 @@
"""Optional semantic search module for CodexLens.
Install with: pip install codexlens[semantic]
Uses fastembed (ONNX-based, lightweight ~200MB)
"""
from __future__ import annotations
SEMANTIC_AVAILABLE = False
SEMANTIC_BACKEND: str | None = None
_import_error: str | None = None
try:
import numpy as np
def _detect_backend() -> tuple[bool, str | None, str | None]:
"""Detect if fastembed is available."""
try:
import numpy as np
except ImportError as e:
return False, None, f"numpy not available: {e}"
try:
from fastembed import TextEmbedding
SEMANTIC_BACKEND = "fastembed"
return True, "fastembed", None
except ImportError:
try:
from sentence_transformers import SentenceTransformer
SEMANTIC_BACKEND = "sentence-transformers"
except ImportError:
raise ImportError("Neither fastembed nor sentence-transformers available")
SEMANTIC_AVAILABLE = True
except ImportError as e:
_import_error = str(e)
SEMANTIC_BACKEND = None
pass
return False, None, "fastembed not available. Install with: pip install codexlens[semantic]"
# Initialize on module load
SEMANTIC_AVAILABLE, SEMANTIC_BACKEND, _import_error = _detect_backend()
def check_semantic_available() -> tuple[bool, str | None]:
"""Check if semantic search dependencies are available."""

View File

@@ -0,0 +1,274 @@
"""Smart code extraction for complete code blocks."""
from __future__ import annotations
from pathlib import Path
from typing import List, Optional, Tuple
from codexlens.entities import SearchResult, Symbol
def extract_complete_code_block(
    result: SearchResult,
    source_file_path: Optional[str] = None,
    context_lines: int = 0,
) -> str:
    """Extract the complete code block for a search result.

    The stored ``result.content`` is returned directly when no extra context
    is requested. Otherwise the source file is re-read and the block's line
    range, widened by ``context_lines`` on each side, is returned.

    Args:
        result: SearchResult from semantic search.
        source_file_path: Optional path to source file for re-reading;
            defaults to ``result.path``.
        context_lines: Additional lines of context to include above/below.
            Negative values are treated as 0.

    Returns:
        Complete code block as string. If the source file cannot be read,
        the stored full content is returned, then the excerpt, then "".
    """
    # A negative context would silently shrink the block; treat it as "none".
    context_lines = max(0, context_lines)

    # Best text available without touching the filesystem. Full content is
    # preferred over the excerpt, which may be a truncated preview.
    fallback = result.content or result.excerpt or ""

    # If we have full content stored and no context is needed, use it as-is.
    if result.content and context_lines == 0:
        return result.content

    file_path = source_file_path or result.path
    if not file_path:
        return fallback

    try:
        text = Path(file_path).read_text(encoding="utf-8", errors="ignore")
    except (OSError, ValueError):
        # Missing/unreadable file, directory, or an invalid path string.
        return fallback

    lines = text.splitlines()

    # Line numbers are 1-based and inclusive; a missing bound means
    # "from the first line" / "to the last line".
    start_line = result.start_line or 1
    end_line = result.end_line or len(lines)

    # Convert to a 0-based half-open slice, widened by the context and
    # clamped to the file.
    start_idx = max(0, start_line - 1 - context_lines)
    end_idx = min(len(lines), end_line + context_lines)
    return "\n".join(lines[start_idx:end_idx])
def extract_symbol_with_context(
    file_path: str,
    symbol: Symbol,
    include_docstring: bool = True,
    include_decorators: bool = True,
) -> str:
    """Extract a symbol (function/class) together with leading decorators.

    Args:
        file_path: Path to source file.
        symbol: Symbol to extract; ``symbol.range`` is read as a 1-based,
            inclusive ``(start_line, end_line)`` pair.
        include_docstring: Currently not consulted by this function; kept
            for interface compatibility. Whatever lies inside the symbol's
            line range is always returned.
        include_decorators: Include decorators/annotations (and JS/Java
            style comment lines) found directly above the symbol.

    Returns:
        Complete symbol code with context, or "" if the file cannot be read
        or the symbol's range is malformed.
    """
    try:
        text = Path(file_path).read_text(encoding="utf-8", errors="ignore")
        start_line, end_line = symbol.range
    except (OSError, ValueError, TypeError, AttributeError):
        # Best-effort contract: unreadable file or unusable symbol -> "".
        return ""

    lines = text.splitlines()

    # Clamp the 1-based inclusive range to a valid 0-based half-open slice.
    # Without this, a start_line of 0 or less becomes a negative index and
    # slices from the end of the file.
    start_idx = min(max(start_line - 1, 0), len(lines))
    end_idx = min(max(end_line, start_idx), len(lines))

    if include_decorators and start_idx > 0:
        first_idx = start_idx
        # Look at most 20 lines back from the symbol's first line.
        lowest = max(0, start_idx - 20)
        for i in range(start_idx - 1, lowest - 1, -1):
            stripped = lines[i].strip()
            if stripped.startswith(("@", "//", "/*", "*")):
                # Decorator/annotation or JavaScript/Java style comment:
                # extend the block upward to include it.
                first_idx = i
            elif stripped == "" or stripped.startswith("#"):
                # Blank lines and '#' comments are skipped; they are only
                # included if a decorator is found further up.
                continue
            else:
                # Any other code line ends the search.
                break
        start_idx = first_idx

    return "\n".join(lines[start_idx:end_idx])
def format_search_result_code(
    result: SearchResult,
    max_lines: Optional[int] = None,
    show_line_numbers: bool = True,
    highlight_match: bool = False,
) -> str:
    """Render a search result's code as display-ready text.

    Args:
        result: SearchResult to format; its ``content`` is used, falling
            back to ``excerpt``.
        max_lines: Maximum lines to show (None or 0 for all).
        show_line_numbers: Prefix each line with its line number, counting
            from ``result.start_line`` (or 1 when unset).
        highlight_match: Accepted but not currently used by this function.

    Returns:
        Formatted code string, with a trailing "... (truncated)" line when
        lines were cut; "" when the result carries no text.
    """
    text = result.content or result.excerpt or ""
    if not text:
        return ""

    rows = text.splitlines()

    # Decide on truncation up front so the marker can be appended at the end.
    was_cut = bool(max_lines) and len(rows) > max_lines
    if was_cut:
        rows = rows[:max_lines]

    if show_line_numbers:
        first_number = result.start_line or 1
        rows = [
            f"{number:4d} | {row}"
            for number, row in enumerate(rows, start=first_number)
        ]

    rendered = "\n".join(rows)
    if was_cut:
        rendered += "\n... (truncated)"
    return rendered
def get_code_block_summary(result: SearchResult) -> str:
    """Build a one-line description of a code block.

    Args:
        result: SearchResult to summarize.

    Returns:
        Summary string like "function `hello_world` (lines 10-25) in app.py",
        or "unknown code block" when the result carries no usable details.
    """
    pieces = []

    kind = result.symbol_kind
    if kind:
        pieces.append(kind)

    name = result.symbol_name
    if name:
        pieces.append(f"`{name}`")
    elif result.excerpt:
        # No symbol name: quote the first 50 characters of the excerpt's
        # first line instead.
        head = result.excerpt.split("\n")[0][:50]
        pieces.append(f'"{head}..."')

    first, last = result.start_line, result.end_line
    if first and last:
        if first == last:
            pieces.append(f"(line {first})")
        else:
            pieces.append(f"(lines {first}-{last})")

    if result.path:
        pieces.append(f"in {Path(result.path).name}")

    if not pieces:
        return "unknown code block"
    return " ".join(pieces)
class CodeBlockResult:
    """Enhanced search result with complete code block.

    Wraps a SearchResult and exposes its key fields as read-only properties,
    plus lazily extracted full code, a summary and a display formatter.
    """

    def __init__(self, result: SearchResult, source_path: Optional[str] = None):
        """Wrap ``result``.

        Args:
            result: The SearchResult to wrap.
            source_path: Optional path used to re-read the source file;
                defaults to ``result.path``.
        """
        self.result = result
        self.source_path = source_path or result.path
        # Filled on first access to ``full_code``.
        self._full_code: Optional[str] = None

    @property
    def score(self) -> float:
        """Score of the wrapped result."""
        return self.result.score

    @property
    def path(self) -> str:
        """Path of the wrapped result."""
        return self.result.path

    @property
    def file_name(self) -> str:
        """Final path component of the wrapped result's path."""
        return Path(self.result.path).name

    @property
    def symbol_name(self) -> Optional[str]:
        """Name of the matched symbol, if any."""
        return self.result.symbol_name

    @property
    def symbol_kind(self) -> Optional[str]:
        """Kind of the matched symbol, if any."""
        return self.result.symbol_kind

    @property
    def line_range(self) -> Tuple[int, int]:
        """Return ``(start_line, end_line)`` for the block.

        A missing start defaults to 1. A missing end defaults to the start,
        so the returned range is never inverted when only the start is known.
        """
        start = self.result.start_line or 1
        end = self.result.end_line or start
        return (start, end)

    @property
    def full_code(self) -> str:
        """Get full code block content (extracted once, then cached)."""
        if self._full_code is None:
            self._full_code = extract_complete_code_block(self.result, self.source_path)
        return self._full_code

    @property
    def excerpt(self) -> str:
        """Get short excerpt ("" when the result has none)."""
        return self.result.excerpt or ""

    @property
    def summary(self) -> str:
        """Get code block summary."""
        return get_code_block_summary(self.result)

    def format(
        self,
        max_lines: Optional[int] = None,
        show_line_numbers: bool = True,
    ) -> str:
        """Format code for display.

        Args:
            max_lines: Maximum lines to show (None for all).
            show_line_numbers: Include line numbers in output.

        Returns:
            Formatted code string produced by ``format_search_result_code``.
        """
        # Build a throwaway result carrying the full code so the formatter
        # shows the complete block rather than the stored excerpt.
        display_result = SearchResult(
            path=self.result.path,
            score=self.result.score,
            content=self.full_code,
            start_line=self.result.start_line,
            end_line=self.result.end_line,
        )
        return format_search_result_code(
            display_result,
            max_lines=max_lines,
            show_line_numbers=show_line_numbers,
        )

    def __repr__(self) -> str:
        return f"<CodeBlockResult {self.summary} score={self.score:.3f}>"
def enhance_search_results(
    results: List[SearchResult],
) -> List[CodeBlockResult]:
    """Wrap each search result in a CodeBlockResult.

    Args:
        results: List of SearchResult from semantic search.

    Returns:
        List of CodeBlockResult, one per input result, in the same order.
    """
    enhanced = []
    for item in results:
        enhanced.append(CodeBlockResult(item))
    return enhanced

View File

@@ -1,17 +1,14 @@
"""Embedder for semantic code search."""
"""Embedder for semantic code search using fastembed."""
from __future__ import annotations
from typing import Iterable, List
from . import SEMANTIC_AVAILABLE, SEMANTIC_BACKEND
if SEMANTIC_AVAILABLE:
import numpy as np
from . import SEMANTIC_AVAILABLE
class Embedder:
"""Generate embeddings for code chunks using fastembed or sentence-transformers."""
"""Generate embeddings for code chunks using fastembed (ONNX-based)."""
MODEL_NAME = "BAAI/bge-small-en-v1.5"
EMBEDDING_DIM = 384
@@ -25,19 +22,14 @@ class Embedder:
self.model_name = model_name or self.MODEL_NAME
self._model = None
self._backend = SEMANTIC_BACKEND
def _load_model(self) -> None:
"""Lazy load the embedding model."""
if self._model is not None:
return
if self._backend == "fastembed":
from fastembed import TextEmbedding
self._model = TextEmbedding(model_name=self.model_name)
else:
from sentence_transformers import SentenceTransformer
self._model = SentenceTransformer(self.model_name)
from fastembed import TextEmbedding
self._model = TextEmbedding(model_name=self.model_name)
def embed(self, texts: str | Iterable[str]) -> List[List[float]]:
"""Generate embeddings for one or more texts.
@@ -55,12 +47,8 @@ class Embedder:
else:
texts = list(texts)
if self._backend == "fastembed":
embeddings = list(self._model.embed(texts))
return [emb.tolist() for emb in embeddings]
else:
embeddings = self._model.encode(texts)
return embeddings.tolist()
embeddings = list(self._model.embed(texts))
return [emb.tolist() for emb in embeddings]
def embed_single(self, text: str) -> List[float]:
"""Generate embedding for a single text."""

View File

@@ -119,6 +119,7 @@ class VectorStore:
query_embedding: List[float],
top_k: int = 10,
min_score: float = 0.0,
return_full_content: bool = True,
) -> List[SearchResult]:
"""Find chunks most similar to query embedding.
@@ -126,6 +127,7 @@ class VectorStore:
query_embedding: Query vector.
top_k: Maximum results to return.
min_score: Minimum similarity score (0-1).
return_full_content: If True, return full code block content.
Returns:
List of SearchResult ordered by similarity (highest first).
@@ -144,14 +146,39 @@ class VectorStore:
if score >= min_score:
metadata = json.loads(metadata_json) if metadata_json else {}
# Build excerpt
# Build excerpt (short preview)
excerpt = content[:200] + "..." if len(content) > 200 else content
# Extract symbol information from metadata
symbol_name = metadata.get("symbol_name")
symbol_kind = metadata.get("symbol_kind")
start_line = metadata.get("start_line")
end_line = metadata.get("end_line")
# Build Symbol object if we have symbol info
symbol = None
if symbol_name and symbol_kind and start_line and end_line:
try:
from codexlens.entities import Symbol
symbol = Symbol(
name=symbol_name,
kind=symbol_kind,
range=(start_line, end_line)
)
except Exception:
pass
results.append((score, SearchResult(
path=file_path,
score=score,
excerpt=excerpt,
symbol=None,
content=content if return_full_content else None,
symbol=symbol,
metadata=metadata,
start_line=start_line,
end_line=end_line,
symbol_name=symbol_name,
symbol_kind=symbol_kind,
)))
# Sort by score descending