mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-02-10 02:24:35 +08:00
Add comprehensive tests for semantic chunking and search functionality
- Implemented tests for the ChunkConfig and Chunker classes, covering default and custom configurations. - Added tests for symbol-based chunking, including single and multiple symbols, handling of empty symbols, and preservation of line numbers. - Developed tests for sliding window chunking, ensuring correct chunking behavior with various content sizes and configurations. - Created integration tests for semantic search, validating embedding generation, vector storage, and search accuracy across a complex codebase. - Included performance tests for embedding generation and search operations. - Established tests for chunking strategies, comparing symbol-based and sliding window approaches. - Enhanced test coverage for edge cases, including handling of unicode characters and out-of-bounds symbol ranges.
This commit is contained in:
@@ -67,7 +67,14 @@ class SearchResult(BaseModel):
|
||||
path: str = Field(..., min_length=1)
|
||||
score: float = Field(..., ge=0.0)
|
||||
excerpt: Optional[str] = None
|
||||
content: Optional[str] = Field(default=None, description="Full content of matched code block")
|
||||
symbol: Optional[Symbol] = None
|
||||
chunk: Optional[SemanticChunk] = None
|
||||
metadata: Dict[str, Any] = Field(default_factory=dict)
|
||||
|
||||
# Additional context for complete code blocks
|
||||
start_line: Optional[int] = Field(default=None, description="Start line of code block (1-based)")
|
||||
end_line: Optional[int] = Field(default=None, description="End line of code block (1-based)")
|
||||
symbol_name: Optional[str] = Field(default=None, description="Name of matched symbol/function/class")
|
||||
symbol_kind: Optional[str] = Field(default=None, description="Kind of symbol (function/class/method)")
|
||||
|
||||
|
||||
@@ -1,28 +1,32 @@
|
||||
"""Optional semantic search module for CodexLens.
|
||||
|
||||
Install with: pip install codexlens[semantic]
|
||||
Uses fastembed (ONNX-based, lightweight ~200MB)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
SEMANTIC_AVAILABLE = False
|
||||
SEMANTIC_BACKEND: str | None = None
|
||||
_import_error: str | None = None
|
||||
|
||||
try:
|
||||
import numpy as np
|
||||
def _detect_backend() -> tuple[bool, str | None, str | None]:
|
||||
"""Detect if fastembed is available."""
|
||||
try:
|
||||
import numpy as np
|
||||
except ImportError as e:
|
||||
return False, None, f"numpy not available: {e}"
|
||||
|
||||
try:
|
||||
from fastembed import TextEmbedding
|
||||
SEMANTIC_BACKEND = "fastembed"
|
||||
return True, "fastembed", None
|
||||
except ImportError:
|
||||
try:
|
||||
from sentence_transformers import SentenceTransformer
|
||||
SEMANTIC_BACKEND = "sentence-transformers"
|
||||
except ImportError:
|
||||
raise ImportError("Neither fastembed nor sentence-transformers available")
|
||||
SEMANTIC_AVAILABLE = True
|
||||
except ImportError as e:
|
||||
_import_error = str(e)
|
||||
SEMANTIC_BACKEND = None
|
||||
pass
|
||||
|
||||
return False, None, "fastembed not available. Install with: pip install codexlens[semantic]"
|
||||
|
||||
# Initialize on module load
|
||||
SEMANTIC_AVAILABLE, SEMANTIC_BACKEND, _import_error = _detect_backend()
|
||||
|
||||
def check_semantic_available() -> tuple[bool, str | None]:
|
||||
"""Check if semantic search dependencies are available."""
|
||||
|
||||
274
codex-lens/src/codexlens/semantic/code_extractor.py
Normal file
274
codex-lens/src/codexlens/semantic/code_extractor.py
Normal file
@@ -0,0 +1,274 @@
|
||||
"""Smart code extraction for complete code blocks."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
from codexlens.entities import SearchResult, Symbol
|
||||
|
||||
|
||||
def extract_complete_code_block(
    result: SearchResult,
    source_file_path: Optional[str] = None,
    context_lines: int = 0,
) -> str:
    """Extract the complete code block for a search result.

    Prefers the full content stored on the result; re-reads the source file
    only when extra context lines are requested or no content is stored.

    Args:
        result: SearchResult from semantic search.
        source_file_path: Optional path to the source file for re-reading.
        context_lines: Additional lines of context to include above/below.

    Returns:
        Complete code block as a string ("" when nothing is available).
    """
    # Stored content is authoritative when no extra context is requested.
    if result.content and context_lines == 0:
        return result.content

    # Otherwise we need the file on disk to (re)build the block.
    file_path = source_file_path or result.path
    if not file_path or not Path(file_path).exists():
        # Fall back to whatever the result already carries.  Fix: previously
        # this returned only `excerpt or ""`, dropping stored full content
        # when the file had moved; now it matches the except-branch fallback.
        return result.excerpt or result.content or ""

    try:
        text = Path(file_path).read_text(encoding="utf-8", errors="ignore")
        lines = text.splitlines()

        # Line numbers are 1-based; missing bounds default to the whole file.
        start_line = result.start_line or 1
        end_line = result.end_line or len(lines)

        # Clamp the context-expanded window to the file boundaries.
        start_idx = max(0, start_line - 1 - context_lines)
        end_idx = min(len(lines), end_line + context_lines)

        return "\n".join(lines[start_idx:end_idx])
    except Exception:
        # Best-effort: any read/decode failure degrades to stored text.
        return result.excerpt or result.content or ""
|
||||
|
||||
|
||||
def _scan_decorators_upward(source_lines: List[str], begin: int) -> int:
    """Return the index where decorators/comment lines above *begin* start."""
    found = begin
    lowest = max(0, begin - 20)  # look at most 20 lines back
    for idx in range(begin - 1, lowest - 1, -1):
        stripped = source_lines[idx].strip()
        if stripped.startswith("@"):
            found = idx
        elif stripped == "" or stripped.startswith("#"):
            # Blank lines and Python comments: keep scanning past them
            # without moving the start of the extracted span.
            continue
        elif stripped.startswith(("//", "/*", "*")):
            # JavaScript/Java style comment lines are pulled into the span.
            found = idx
        else:
            # First real code line above the symbol — stop scanning.
            break
    return found


def extract_symbol_with_context(
    file_path: str,
    symbol: Symbol,
    include_docstring: bool = True,
    include_decorators: bool = True,
) -> str:
    """Extract a symbol (function/class) together with surrounding context.

    Scans up to 20 lines above the symbol for decorators (``@...``) and
    JS/Java-style comment lines and includes them in the extracted span.

    Args:
        file_path: Path to the source file.
        symbol: Symbol to extract; ``symbol.range`` gives 1-based start/end
            lines.
        include_docstring: Accepted for API compatibility; the docstring sits
            inside the symbol range and is always included regardless.
        include_decorators: Include decorators/annotations above the symbol.

    Returns:
        Complete symbol code with context, or "" on any read failure.
    """
    try:
        source = Path(file_path).read_text(encoding="utf-8", errors="ignore")
        source_lines = source.splitlines()

        first_line, last_line = symbol.range
        begin = first_line - 1  # 1-based line -> 0-based index
        finish = last_line

        if include_decorators and begin > 0:
            begin = _scan_decorators_upward(source_lines, begin)

        return "\n".join(source_lines[begin:finish])
    except Exception:
        return ""
|
||||
|
||||
|
||||
def format_search_result_code(
    result: SearchResult,
    max_lines: Optional[int] = None,
    show_line_numbers: bool = True,
    highlight_match: bool = False,
) -> str:
    """Render a search result's code for display.

    Args:
        result: SearchResult to format.
        max_lines: Maximum lines to show (None for all).
        show_line_numbers: Prefix each line with its 1-based line number.
        highlight_match: Accepted for API compatibility; matched-region
            markers are not currently emitted.

    Returns:
        Formatted code string ("" when the result carries no text).
    """
    body = result.content or result.excerpt or ""
    if not body:
        return ""

    code_lines = body.splitlines()

    # Trim to the requested size, remembering whether anything was cut.
    was_cut = bool(max_lines) and len(code_lines) > max_lines
    if was_cut:
        code_lines = code_lines[:max_lines]

    if show_line_numbers:
        first = result.start_line or 1
        rendered = "\n".join(
            f"{first + offset:4d} | {text}"
            for offset, text in enumerate(code_lines)
        )
    else:
        rendered = "\n".join(code_lines)

    if was_cut:
        rendered += "\n... (truncated)"

    return rendered
|
||||
|
||||
|
||||
def get_code_block_summary(result: SearchResult) -> str:
    """Build a one-line human-readable summary of a code block.

    Args:
        result: SearchResult to summarize.

    Returns:
        Summary string such as ``function `hello_world` (lines 10-25) in app.py``.
    """
    pieces: List[str] = []

    if result.symbol_kind:
        pieces.append(result.symbol_kind)

    if result.symbol_name:
        pieces.append(f"`{result.symbol_name}`")
    elif result.excerpt:
        # No symbol name: fall back to the first excerpt line, truncated.
        preview = result.excerpt.split("\n")[0][:50]
        pieces.append(f'"{preview}..."')

    if result.start_line and result.end_line:
        span = (
            f"(line {result.start_line})"
            if result.start_line == result.end_line
            else f"(lines {result.start_line}-{result.end_line})"
        )
        pieces.append(span)

    if result.path:
        pieces.append(f"in {Path(result.path).name}")

    return " ".join(pieces) if pieces else "unknown code block"
|
||||
|
||||
|
||||
class CodeBlockResult:
    """Search-result wrapper that exposes the complete code block lazily.

    Thin read-only facade over a SearchResult: simple fields are forwarded
    via properties, and the full code block is extracted from disk only on
    first access to :pyattr:`full_code`, then cached.
    """

    def __init__(self, result: SearchResult, source_path: Optional[str] = None):
        # Underlying search hit and the file used for lazy re-extraction.
        self.result = result
        self.source_path = source_path or result.path
        # Cache slot for the extracted block; filled on first access.
        self._full_code: Optional[str] = None

    @property
    def score(self) -> float:
        """Similarity score of the underlying result."""
        return self.result.score

    @property
    def path(self) -> str:
        """Path of the matched file."""
        return self.result.path

    @property
    def file_name(self) -> str:
        """Base name of the matched file."""
        return Path(self.result.path).name

    @property
    def symbol_name(self) -> Optional[str]:
        """Name of the matched symbol, if any."""
        return self.result.symbol_name

    @property
    def symbol_kind(self) -> Optional[str]:
        """Kind of the matched symbol (function/class/method), if any."""
        return self.result.symbol_kind

    @property
    def line_range(self) -> Tuple[int, int]:
        """1-based (start, end) line range; missing bounds default to 1."""
        return (self.result.start_line or 1, self.result.end_line or 1)

    @property
    def full_code(self) -> str:
        """Full code block content, extracted once and cached."""
        if self._full_code is None:
            self._full_code = extract_complete_code_block(
                self.result, self.source_path
            )
        return self._full_code

    @property
    def excerpt(self) -> str:
        """Short preview text stored on the result ("" when absent)."""
        return self.result.excerpt or ""

    @property
    def summary(self) -> str:
        """Concise one-line description of the block."""
        return get_code_block_summary(self.result)

    def format(
        self,
        max_lines: Optional[int] = None,
        show_line_numbers: bool = True,
    ) -> str:
        """Format the complete code block for display."""
        # Re-wrap with the full code so the formatter sees the whole block
        # rather than just the stored excerpt.
        enriched = SearchResult(
            path=self.result.path,
            score=self.result.score,
            content=self.full_code,
            start_line=self.result.start_line,
            end_line=self.result.end_line,
        )
        return format_search_result_code(
            enriched,
            max_lines=max_lines,
            show_line_numbers=show_line_numbers,
        )

    def __repr__(self) -> str:
        return f"<CodeBlockResult {self.summary} score={self.score:.3f}>"
|
||||
|
||||
|
||||
def enhance_search_results(
    results: List[SearchResult],
) -> List[CodeBlockResult]:
    """Wrap raw search results so callers can access complete code blocks.

    Args:
        results: List of SearchResult from semantic search.

    Returns:
        List of CodeBlockResult with full code access, in the same order.
    """
    enhanced: List[CodeBlockResult] = []
    for raw in results:
        enhanced.append(CodeBlockResult(raw))
    return enhanced
|
||||
@@ -1,17 +1,14 @@
|
||||
"""Embedder for semantic code search."""
|
||||
"""Embedder for semantic code search using fastembed."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Iterable, List
|
||||
|
||||
from . import SEMANTIC_AVAILABLE, SEMANTIC_BACKEND
|
||||
|
||||
if SEMANTIC_AVAILABLE:
|
||||
import numpy as np
|
||||
from . import SEMANTIC_AVAILABLE
|
||||
|
||||
|
||||
class Embedder:
|
||||
"""Generate embeddings for code chunks using fastembed or sentence-transformers."""
|
||||
"""Generate embeddings for code chunks using fastembed (ONNX-based)."""
|
||||
|
||||
MODEL_NAME = "BAAI/bge-small-en-v1.5"
|
||||
EMBEDDING_DIM = 384
|
||||
@@ -25,19 +22,14 @@ class Embedder:
|
||||
|
||||
self.model_name = model_name or self.MODEL_NAME
|
||||
self._model = None
|
||||
self._backend = SEMANTIC_BACKEND
|
||||
|
||||
def _load_model(self) -> None:
|
||||
"""Lazy load the embedding model."""
|
||||
if self._model is not None:
|
||||
return
|
||||
|
||||
if self._backend == "fastembed":
|
||||
from fastembed import TextEmbedding
|
||||
self._model = TextEmbedding(model_name=self.model_name)
|
||||
else:
|
||||
from sentence_transformers import SentenceTransformer
|
||||
self._model = SentenceTransformer(self.model_name)
|
||||
from fastembed import TextEmbedding
|
||||
self._model = TextEmbedding(model_name=self.model_name)
|
||||
|
||||
def embed(self, texts: str | Iterable[str]) -> List[List[float]]:
|
||||
"""Generate embeddings for one or more texts.
|
||||
@@ -55,12 +47,8 @@ class Embedder:
|
||||
else:
|
||||
texts = list(texts)
|
||||
|
||||
if self._backend == "fastembed":
|
||||
embeddings = list(self._model.embed(texts))
|
||||
return [emb.tolist() for emb in embeddings]
|
||||
else:
|
||||
embeddings = self._model.encode(texts)
|
||||
return embeddings.tolist()
|
||||
embeddings = list(self._model.embed(texts))
|
||||
return [emb.tolist() for emb in embeddings]
|
||||
|
||||
def embed_single(self, text: str) -> List[float]:
|
||||
"""Generate embedding for a single text."""
|
||||
|
||||
@@ -119,6 +119,7 @@ class VectorStore:
|
||||
query_embedding: List[float],
|
||||
top_k: int = 10,
|
||||
min_score: float = 0.0,
|
||||
return_full_content: bool = True,
|
||||
) -> List[SearchResult]:
|
||||
"""Find chunks most similar to query embedding.
|
||||
|
||||
@@ -126,6 +127,7 @@ class VectorStore:
|
||||
query_embedding: Query vector.
|
||||
top_k: Maximum results to return.
|
||||
min_score: Minimum similarity score (0-1).
|
||||
return_full_content: If True, return full code block content.
|
||||
|
||||
Returns:
|
||||
List of SearchResult ordered by similarity (highest first).
|
||||
@@ -144,14 +146,39 @@ class VectorStore:
|
||||
if score >= min_score:
|
||||
metadata = json.loads(metadata_json) if metadata_json else {}
|
||||
|
||||
# Build excerpt
|
||||
# Build excerpt (short preview)
|
||||
excerpt = content[:200] + "..." if len(content) > 200 else content
|
||||
|
||||
# Extract symbol information from metadata
|
||||
symbol_name = metadata.get("symbol_name")
|
||||
symbol_kind = metadata.get("symbol_kind")
|
||||
start_line = metadata.get("start_line")
|
||||
end_line = metadata.get("end_line")
|
||||
|
||||
# Build Symbol object if we have symbol info
|
||||
symbol = None
|
||||
if symbol_name and symbol_kind and start_line and end_line:
|
||||
try:
|
||||
from codexlens.entities import Symbol
|
||||
symbol = Symbol(
|
||||
name=symbol_name,
|
||||
kind=symbol_kind,
|
||||
range=(start_line, end_line)
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
results.append((score, SearchResult(
|
||||
path=file_path,
|
||||
score=score,
|
||||
excerpt=excerpt,
|
||||
symbol=None,
|
||||
content=content if return_full_content else None,
|
||||
symbol=symbol,
|
||||
metadata=metadata,
|
||||
start_line=start_line,
|
||||
end_line=end_line,
|
||||
symbol_name=symbol_name,
|
||||
symbol_kind=symbol_kind,
|
||||
)))
|
||||
|
||||
# Sort by score descending
|
||||
|
||||
Reference in New Issue
Block a user