Add comprehensive tests for tokenizer, performance benchmarks, and TreeSitter parser functionality

- Implemented unit tests for the Tokenizer class, covering various text inputs, edge cases, and fallback mechanisms; a minimal test sketch follows after this list.
- Created performance benchmarks comparing tiktoken and pure Python implementations for token counting.
- Developed extensive tests for TreeSitterSymbolParser across Python, JavaScript, and TypeScript, ensuring accurate symbol extraction and parsing.
- Added configuration documentation for MCP integration and custom prompts, enhancing usability and flexibility.
- Introduced a refactor script for GraphAnalyzer to streamline future improvements.
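
A minimal sketch of the style of Tokenizer test this adds (the test names and the empty-input expectation are illustrative, not the committed tests; get_default_tokenizer and count_tokens appear in the diff below):

from codexlens.parsers.tokenizer import get_default_tokenizer

def test_count_tokens_nonempty():
    tokenizer = get_default_tokenizer()
    # Non-empty source should produce at least one token, whether the
    # tiktoken backend or the pure Python fallback is active
    assert tokenizer.count_tokens("def foo():\n    return 1\n") > 0

def test_count_tokens_empty():
    tokenizer = get_default_tokenizer()
    # Assumed behavior: empty input counts as zero tokens
    assert tokenizer.count_tokens("") == 0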
Author: catlog22
Date: 2025-12-15 14:36:09 +08:00
Parent: 82dcafff00
Commit: 0fe16963cd
49 changed files with 9307 additions and 438 deletions


@@ -4,9 +4,10 @@ from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional, Tuple
from codexlens.entities import SemanticChunk, Symbol
from codexlens.parsers.tokenizer import get_default_tokenizer
@dataclass
@@ -14,6 +15,7 @@ class ChunkConfig:
"""Configuration for chunking strategies."""
max_chunk_size: int = 1000 # Max characters per chunk
overlap: int = 100 # Overlap for sliding window
strategy: str = "auto" # Chunking strategy: auto, symbol, sliding_window, hybrid
min_chunk_size: int = 50 # Minimum chunk size
@@ -22,6 +24,7 @@ class Chunker:
def __init__(self, config: ChunkConfig | None = None) -> None:
self.config = config or ChunkConfig()
self._tokenizer = get_default_tokenizer()
def chunk_by_symbol(
self,
@@ -29,10 +32,18 @@ class Chunker:
symbols: List[Symbol],
file_path: str | Path,
language: str,
symbol_token_counts: Optional[dict[str, int]] = None,
) -> List[SemanticChunk]:
"""Chunk code by extracted symbols (functions, classes).
Each symbol becomes one chunk with its full content.
Args:
content: Source code content
symbols: List of extracted symbols
file_path: Path to source file
language: Programming language
symbol_token_counts: Optional dict mapping symbol names to token counts
"""
chunks: List[SemanticChunk] = []
lines = content.splitlines(keepends=True)
@@ -47,6 +58,13 @@ class Chunker:
if len(chunk_content.strip()) < self.config.min_chunk_size:
continue
# Calculate token count if not provided
token_count = None
if symbol_token_counts and symbol.name in symbol_token_counts:
token_count = symbol_token_counts[symbol.name]
else:
token_count = self._tokenizer.count_tokens(chunk_content)
chunks.append(SemanticChunk(
content=chunk_content,
embedding=None,
@@ -58,6 +76,7 @@ class Chunker:
"start_line": start_line,
"end_line": end_line,
"strategy": "symbol",
"token_count": token_count,
}
))
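# Illustrative call (names hypothetical): precomputed counts avoid re-tokenizing
# known symbols, while missing entries fall back to the tokenizer above, e.g.
# chunker.chunk_by_symbol(src, symbols, "app.py", "python", symbol_token_counts={"main": 42})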
@@ -68,10 +87,19 @@ class Chunker:
content: str,
file_path: str | Path,
language: str,
line_mapping: Optional[List[int]] = None,
) -> List[SemanticChunk]:
"""Chunk code using sliding window approach.
Used for files without clear symbol boundaries or very long functions.
Args:
content: Source code content
file_path: Path to source file
language: Programming language
line_mapping: Optional list mapping content line indices to original line numbers
(1-indexed). If provided, line_mapping[i] is the original line number
for the i-th line in content.
"""
chunks: List[SemanticChunk] = []
lines = content.splitlines(keepends=True)
@@ -92,6 +120,18 @@ class Chunker:
chunk_content = "".join(lines[start:end])
if len(chunk_content.strip()) >= self.config.min_chunk_size:
token_count = self._tokenizer.count_tokens(chunk_content)
# Calculate correct line numbers
if line_mapping:
# Use line mapping to get original line numbers
start_line = line_mapping[start]
end_line = line_mapping[end - 1]
else:
# Default behavior: treat content as starting at line 1
start_line = start + 1
end_line = end
chunks.append(SemanticChunk(
content=chunk_content,
embedding=None,
@@ -99,9 +139,10 @@ class Chunker:
"file": str(file_path),
"language": language,
"chunk_index": chunk_idx,
"start_line": start + 1,
"end_line": end,
"start_line": start_line,
"end_line": end_line,
"strategy": "sliding_window",
"token_count": token_count,
}
))
chunk_idx += 1
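# line_mapping semantics, with hypothetical values: if content holds original
# lines 10, 11 and 14, then line_mapping == [10, 11, 14], so a window over
# content lines 0..2 reports start_line=10, end_line=14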
@@ -119,12 +160,239 @@ class Chunker:
symbols: List[Symbol],
file_path: str | Path,
language: str,
symbol_token_counts: Optional[dict[str, int]] = None,
) -> List[SemanticChunk]:
"""Chunk a file using the best strategy.
Uses symbol-based chunking when symbols are available,
and falls back to a sliding window for files without symbols.
Args:
content: Source code content
symbols: List of extracted symbols
file_path: Path to source file
language: Programming language
symbol_token_counts: Optional dict mapping symbol names to token counts
"""
if symbols:
return self.chunk_by_symbol(content, symbols, file_path, language, symbol_token_counts)
return self.chunk_sliding_window(content, file_path, language)
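# Sketch of strategy selection: a parsed module with symbols produces symbol
# chunks; a symbol-less file such as chunk_file(src, [], "notes.txt", "text")
# falls through to the sliding window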
class DocstringExtractor:
"""Extract docstrings from source code."""
@staticmethod
def extract_python_docstrings(content: str) -> List[Tuple[str, int, int]]:
"""Extract Python docstrings with their line ranges.
Returns: List of (docstring_content, start_line, end_line) tuples
"""
docstrings: List[Tuple[str, int, int]] = []
lines = content.splitlines(keepends=True)
i = 0
while i < len(lines):
line = lines[i]
stripped = line.strip()
if stripped.startswith('"""') or stripped.startswith("'''"):
quote_type = '"""' if stripped.startswith('"""') else "'''"
start_line = i + 1
if stripped.count(quote_type) >= 2:
docstring_content = line
end_line = i + 1
docstrings.append((docstring_content, start_line, end_line))
i += 1
continue
docstring_lines = [line]
i += 1
while i < len(lines):
docstring_lines.append(lines[i])
if quote_type in lines[i]:
break
i += 1
end_line = i + 1
docstring_content = "".join(docstring_lines)
docstrings.append((docstring_content, start_line, end_line))
i += 1
return docstrings
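# Example (sketch): content of '"""Module doc."""\ndef f():\n    """f doc."""\n'
# returns [('"""Module doc."""\n', 1, 1), ('    """f doc."""\n', 3, 3)]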
@staticmethod
def extract_jsdoc_comments(content: str) -> List[Tuple[str, int, int]]:
"""Extract JSDoc comments with their line ranges.
Returns: List of (comment_content, start_line, end_line) tuples
"""
comments: List[Tuple[str, int, int]] = []
lines = content.splitlines(keepends=True)
i = 0
while i < len(lines):
line = lines[i]
stripped = line.strip()
if stripped.startswith('/**'):
start_line = i + 1
# Single-line JSDoc: opening '/**' and closing '*/' on the same line
if '*/' in line:
comments.append((line, start_line, start_line))
i += 1
continue
comment_lines = [line]
i += 1
while i < len(lines):
comment_lines.append(lines[i])
if '*/' in lines[i]:
break
i += 1
end_line = i + 1
comment_content = "".join(comment_lines)
comments.append((comment_content, start_line, end_line))
i += 1
return comments
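# Example (sketch): '/** Adds two numbers. */\nfunction add() {}\n' yields
# [('/** Adds two numbers. */\n', 1, 1)]; multi-line blocks end at the
# line containing '*/'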
@classmethod
def extract_docstrings(
cls,
content: str,
language: str
) -> List[Tuple[str, int, int]]:
"""Extract docstrings based on language.
Returns: List of (docstring_content, start_line, end_line) tuples
"""
if language == "python":
return cls.extract_python_docstrings(content)
elif language in {"javascript", "typescript"}:
return cls.extract_jsdoc_comments(content)
return []
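# Dispatch example (sketch): extract_docstrings(src, "typescript") uses the
# JSDoc extractor; an unsupported language such as "go" returns []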
class HybridChunker:
"""Hybrid chunker that prioritizes docstrings before symbol-based chunking.
Composition-based strategy that:
1. Extracts docstrings as dedicated chunks
2. For remaining code, uses base chunker (symbol or sliding window)
"""
def __init__(
self,
base_chunker: Chunker | None = None,
config: ChunkConfig | None = None
) -> None:
"""Initialize hybrid chunker.
Args:
base_chunker: Chunker to use for non-docstring content
config: Configuration for chunking
"""
self.config = config or ChunkConfig()
self.base_chunker = base_chunker or Chunker(self.config)
self.docstring_extractor = DocstringExtractor()
def _get_excluded_line_ranges(
self,
docstrings: List[Tuple[str, int, int]]
) -> set[int]:
"""Get set of line numbers that are part of docstrings."""
excluded_lines: set[int] = set()
for _, start_line, end_line in docstrings:
for line_num in range(start_line, end_line + 1):
excluded_lines.add(line_num)
return excluded_lines
def _filter_symbols_outside_docstrings(
self,
symbols: List[Symbol],
excluded_lines: set[int]
) -> List[Symbol]:
"""Filter symbols to exclude those completely within docstrings."""
filtered: List[Symbol] = []
for symbol in symbols:
start_line, end_line = symbol.range
symbol_lines = set(range(start_line, end_line + 1))
if not symbol_lines.issubset(excluded_lines):
filtered.append(symbol)
return filtered
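# Example (sketch): with excluded_lines == {4, 5, 6}, a symbol spanning lines
# 3-7 is kept (its range is not a subset of the excluded set), while a symbol
# spanning 4-5 sits entirely inside a docstring and is dropped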
def chunk_file(
self,
content: str,
symbols: List[Symbol],
file_path: str | Path,
language: str,
symbol_token_counts: Optional[dict[str, int]] = None,
) -> List[SemanticChunk]:
"""Chunk file using hybrid strategy.
Extracts docstrings first, then chunks remaining code.
Args:
content: Source code content
symbols: List of extracted symbols
file_path: Path to source file
language: Programming language
symbol_token_counts: Optional dict mapping symbol names to token counts
"""
chunks: List[SemanticChunk] = []
tokenizer = get_default_tokenizer()
# Step 1: Extract docstrings as dedicated chunks
docstrings = self.docstring_extractor.extract_docstrings(content, language)
for docstring_content, start_line, end_line in docstrings:
if len(docstring_content.strip()) >= self.config.min_chunk_size:
token_count = tokenizer.count_tokens(docstring_content)
chunks.append(SemanticChunk(
content=docstring_content,
embedding=None,
metadata={
"file": str(file_path),
"language": language,
"chunk_type": "docstring",
"start_line": start_line,
"end_line": end_line,
"strategy": "hybrid",
"token_count": token_count,
}
))
# Step 2: Get line ranges occupied by docstrings
excluded_lines = self._get_excluded_line_ranges(docstrings)
# Step 3: Filter symbols to exclude docstring-only ranges
filtered_symbols = self._filter_symbols_outside_docstrings(symbols, excluded_lines)
# Step 4: Chunk remaining content using base chunker
if filtered_symbols:
base_chunks = self.base_chunker.chunk_by_symbol(
content, filtered_symbols, file_path, language, symbol_token_counts
)
for chunk in base_chunks:
chunk.metadata["strategy"] = "hybrid"
chunk.metadata["chunk_type"] = "code"
chunks.append(chunk)
else:
lines = content.splitlines(keepends=True)
remaining_lines: List[str] = []
line_mapping: List[int] = []
for i, line in enumerate(lines, start=1):
if i not in excluded_lines:
remaining_lines.append(line)
line_mapping.append(i)
if remaining_lines:
remaining_content = "".join(remaining_lines)
if len(remaining_content.strip()) >= self.config.min_chunk_size:
# Pass the original line numbers so sliding-window chunks report
# positions in the source file rather than in the filtered text
base_chunks = self.base_chunker.chunk_sliding_window(
remaining_content, file_path, language, line_mapping
)
for chunk in base_chunks:
chunk.metadata["strategy"] = "hybrid"
chunk.metadata["chunk_type"] = "code"
chunks.append(chunk)
return chunks
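
A minimal end-to-end sketch of how HybridChunker composes with the base Chunker (the module path codexlens.chunker, the config values, and the sample source are assumptions for illustration):

from codexlens.chunker import ChunkConfig, Chunker, HybridChunker  # assumed module path

config = ChunkConfig(max_chunk_size=800, overlap=80, min_chunk_size=10)
chunker = HybridChunker(base_chunker=Chunker(config), config=config)

source = '"""Top-level module docstring."""\n\ndef add(a, b):\n    return a + b\n'
# With no parsed symbols, the docstring becomes a dedicated chunk and the
# remaining code falls through to the sliding window, keeping original line numbers
chunks = chunker.chunk_file(source, symbols=[], file_path="example.py", language="python")
for chunk in chunks:
    meta = chunk.metadata
    print(meta["chunk_type"], meta["start_line"], meta["end_line"], meta["token_count"])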