Add comprehensive tests for tokenizer, performance benchmarks, and TreeSitter parser functionality

- Implemented unit tests for the Tokenizer class, covering various text inputs, edge cases, and fallback mechanisms. - Created performance benchmarks comparing tiktoken and pure Python implementations for token counting. - Developed extensive tests for TreeSitterSymbolParser across Python, JavaScript, and TypeScript, ensuring accurate symbol extraction and parsing. - Added configuration documentation for MCP integration and custom prompts, enhancing usability and flexibility. - Introduced a refactor script for GraphAnalyzer to streamline future improvements.
2026-03-29 20:11:04 +08:00 · 2025-12-15 14:36:09 +08:00
parent 82dcafff00
commit 0fe16963cd
49 changed files with 9307 additions and 438 deletions
--- a/codex-lens/tests/test_hybrid_chunker.py
+++ b/codex-lens/tests/test_hybrid_chunker.py
@@ -0,0 +1,561 @@
+"""Tests for Hybrid Docstring Chunker."""
+
+import pytest
+
+from codexlens.entities import SemanticChunk, Symbol
+from codexlens.semantic.chunker import (
+    ChunkConfig,
+    Chunker,
+    DocstringExtractor,
+    HybridChunker,
+)
+
+
+class TestDocstringExtractor:
+    """Tests for DocstringExtractor class."""
+
+    def test_extract_single_line_python_docstring(self):
+        """Test extraction of single-line Python docstring."""
+        content = '''def hello():
+    """This is a docstring."""
+    return True
+'''
+        docstrings = DocstringExtractor.extract_python_docstrings(content)
+        assert len(docstrings) == 1
+        assert docstrings[0][1] == 2  # start_line
+        assert docstrings[0][2] == 2  # end_line
+        assert '"""This is a docstring."""' in docstrings[0][0]
+
+    def test_extract_multi_line_python_docstring(self):
+        """Test extraction of multi-line Python docstring."""
+        content = '''def process():
+    """
+    This is a multi-line
+    docstring with details.
+    """
+    return 42
+'''
+        docstrings = DocstringExtractor.extract_python_docstrings(content)
+        assert len(docstrings) == 1
+        assert docstrings[0][1] == 2  # start_line
+        assert docstrings[0][2] == 5  # end_line
+        assert "multi-line" in docstrings[0][0]
+
+    def test_extract_multiple_python_docstrings(self):
+        """Test extraction of multiple docstrings from same file."""
+        content = '''"""Module docstring."""
+
+def func1():
+    """Function 1 docstring."""
+    pass
+
+class MyClass:
+    """Class docstring."""
+
+    def method(self):
+        """Method docstring."""
+        pass
+'''
+        docstrings = DocstringExtractor.extract_python_docstrings(content)
+        assert len(docstrings) == 4
+        lines = [d[1] for d in docstrings]
+        assert 1 in lines  # Module docstring
+        assert 4 in lines  # func1 docstring
+        assert 8 in lines  # Class docstring
+        assert 11 in lines  # method docstring
+
+    def test_extract_python_docstring_single_quotes(self):
+        """Test extraction with single quote docstrings."""
+        content = """def test():
+    '''Single quote docstring.'''
+    return None
+"""
+        docstrings = DocstringExtractor.extract_python_docstrings(content)
+        assert len(docstrings) == 1
+        assert "Single quote docstring" in docstrings[0][0]
+
+    def test_extract_jsdoc_single_comment(self):
+        """Test extraction of single JSDoc comment."""
+        content = '''/**
+ * This is a JSDoc comment
+ * @param {string} name
+ */
+function hello(name) {
+    return name;
+}
+'''
+        comments = DocstringExtractor.extract_jsdoc_comments(content)
+        assert len(comments) == 1
+        assert comments[0][1] == 1  # start_line
+        assert comments[0][2] == 4  # end_line
+        assert "JSDoc comment" in comments[0][0]
+
+    def test_extract_multiple_jsdoc_comments(self):
+        """Test extraction of multiple JSDoc comments."""
+        content = '''/**
+ * Function 1
+ */
+function func1() {}
+
+/**
+ * Class description
+ */
+class MyClass {
+    /**
+     * Method description
+     */
+    method() {}
+}
+'''
+        comments = DocstringExtractor.extract_jsdoc_comments(content)
+        assert len(comments) == 3
+
+    def test_extract_docstrings_unsupported_language(self):
+        """Test that unsupported languages return empty list."""
+        content = "// Some code"
+        docstrings = DocstringExtractor.extract_docstrings(content, "ruby")
+        assert len(docstrings) == 0
+
+    def test_extract_docstrings_empty_content(self):
+        """Test extraction from empty content."""
+        docstrings = DocstringExtractor.extract_python_docstrings("")
+        assert len(docstrings) == 0
+
+
+class TestHybridChunker:
+    """Tests for HybridChunker class."""
+
+    def test_hybrid_chunker_initialization(self):
+        """Test HybridChunker initialization with defaults."""
+        chunker = HybridChunker()
+        assert chunker.config is not None
+        assert chunker.base_chunker is not None
+        assert chunker.docstring_extractor is not None
+
+    def test_hybrid_chunker_custom_config(self):
+        """Test HybridChunker with custom config."""
+        config = ChunkConfig(max_chunk_size=500, min_chunk_size=20)
+        chunker = HybridChunker(config=config)
+        assert chunker.config.max_chunk_size == 500
+        assert chunker.config.min_chunk_size == 20
+
+    def test_hybrid_chunker_isolates_docstrings(self):
+        """Test that hybrid chunker isolates docstrings into separate chunks."""
+        config = ChunkConfig(min_chunk_size=10)
+        chunker = HybridChunker(config=config)
+
+        content = '''"""Module-level docstring."""
+
+def hello():
+    """Function docstring."""
+    return "world"
+
+def goodbye():
+    """Another docstring."""
+    return "farewell"
+'''
+        symbols = [
+            Symbol(name="hello", kind="function", range=(3, 5)),
+            Symbol(name="goodbye", kind="function", range=(7, 9)),
+        ]
+
+        chunks = chunker.chunk_file(content, symbols, "test.py", "python")
+
+        # Should have 3 docstring chunks + 2 code chunks = 5 total
+        docstring_chunks = [c for c in chunks if c.metadata.get("chunk_type") == "docstring"]
+        code_chunks = [c for c in chunks if c.metadata.get("chunk_type") == "code"]
+
+        assert len(docstring_chunks) == 3
+        assert len(code_chunks) == 2
+        assert all(c.metadata["strategy"] == "hybrid" for c in chunks)
+
+    def test_hybrid_chunker_docstring_isolation_percentage(self):
+        """Test that >98% of docstrings are isolated correctly."""
+        config = ChunkConfig(min_chunk_size=5)
+        chunker = HybridChunker(config=config)
+
+        # Create content with 10 docstrings
+        lines = []
+        lines.append('"""Module docstring."""\n')
+        lines.append('\n')
+
+        for i in range(10):
+            lines.append(f'def func{i}():\n')
+            lines.append(f'    """Docstring for func{i}."""\n')
+            lines.append(f'    return {i}\n')
+            lines.append('\n')
+
+        content = "".join(lines)
+        symbols = [
+            Symbol(name=f"func{i}", kind="function", range=(3 + i*4, 5 + i*4))
+            for i in range(10)
+        ]
+
+        chunks = chunker.chunk_file(content, symbols, "test.py", "python")
+
+        docstring_chunks = [c for c in chunks if c.metadata.get("chunk_type") == "docstring"]
+
+        # We have 11 docstrings total (1 module + 10 functions)
+        # Verify >98% isolation (at least 10.78 out of 11)
+        isolation_rate = len(docstring_chunks) / 11
+        assert isolation_rate >= 0.98, f"Docstring isolation rate {isolation_rate:.2%} < 98%"
+
+    def test_hybrid_chunker_javascript_jsdoc(self):
+        """Test hybrid chunker with JavaScript JSDoc comments."""
+        config = ChunkConfig(min_chunk_size=10)
+        chunker = HybridChunker(config=config)
+
+        content = '''/**
+ * Main function description
+ */
+function main() {
+    return 42;
+}
+
+/**
+ * Helper function
+ */
+function helper() {
+    return 0;
+}
+'''
+        symbols = [
+            Symbol(name="main", kind="function", range=(4, 6)),
+            Symbol(name="helper", kind="function", range=(11, 13)),
+        ]
+
+        chunks = chunker.chunk_file(content, symbols, "test.js", "javascript")
+
+        docstring_chunks = [c for c in chunks if c.metadata.get("chunk_type") == "docstring"]
+        code_chunks = [c for c in chunks if c.metadata.get("chunk_type") == "code"]
+
+        assert len(docstring_chunks) == 2
+        assert len(code_chunks) == 2
+
+    def test_hybrid_chunker_no_docstrings(self):
+        """Test hybrid chunker with code containing no docstrings."""
+        config = ChunkConfig(min_chunk_size=10)
+        chunker = HybridChunker(config=config)
+
+        content = '''def hello():
+    return "world"
+
+def goodbye():
+    return "farewell"
+'''
+        symbols = [
+            Symbol(name="hello", kind="function", range=(1, 2)),
+            Symbol(name="goodbye", kind="function", range=(4, 5)),
+        ]
+
+        chunks = chunker.chunk_file(content, symbols, "test.py", "python")
+
+        # All chunks should be code chunks
+        assert all(c.metadata.get("chunk_type") == "code" for c in chunks)
+        assert len(chunks) == 2
+
+    def test_hybrid_chunker_preserves_metadata(self):
+        """Test that hybrid chunker preserves all required metadata."""
+        config = ChunkConfig(min_chunk_size=5)
+        chunker = HybridChunker(config=config)
+
+        content = '''"""Module doc."""
+
+def test():
+    """Test doc."""
+    pass
+'''
+        symbols = [Symbol(name="test", kind="function", range=(3, 5))]
+
+        chunks = chunker.chunk_file(content, symbols, "/path/to/file.py", "python")
+
+        for chunk in chunks:
+            assert "file" in chunk.metadata
+            assert "language" in chunk.metadata
+            assert "chunk_type" in chunk.metadata
+            assert "start_line" in chunk.metadata
+            assert "end_line" in chunk.metadata
+            assert "strategy" in chunk.metadata
+            assert chunk.metadata["strategy"] == "hybrid"
+
+    def test_hybrid_chunker_no_symbols_fallback(self):
+        """Test hybrid chunker falls back to sliding window when no symbols."""
+        config = ChunkConfig(min_chunk_size=5, max_chunk_size=100)
+        chunker = HybridChunker(config=config)
+
+        content = '''"""Module docstring."""
+
+# Just some comments
+x = 42
+y = 100
+'''
+        chunks = chunker.chunk_file(content, [], "test.py", "python")
+
+        # Should have 1 docstring chunk + sliding window chunks for remaining code
+        docstring_chunks = [c for c in chunks if c.metadata.get("chunk_type") == "docstring"]
+        code_chunks = [c for c in chunks if c.metadata.get("chunk_type") == "code"]
+
+        assert len(docstring_chunks) == 1
+        assert len(code_chunks) >= 0  # May or may not have code chunks depending on size
+
+    def test_get_excluded_line_ranges(self):
+        """Test _get_excluded_line_ranges helper method."""
+        chunker = HybridChunker()
+
+        docstrings = [
+            ("doc1", 1, 3),
+            ("doc2", 5, 7),
+            ("doc3", 10, 10),
+        ]
+
+        excluded = chunker._get_excluded_line_ranges(docstrings)
+
+        assert 1 in excluded
+        assert 2 in excluded
+        assert 3 in excluded
+        assert 4 not in excluded
+        assert 5 in excluded
+        assert 6 in excluded
+        assert 7 in excluded
+        assert 8 not in excluded
+        assert 9 not in excluded
+        assert 10 in excluded
+
+    def test_filter_symbols_outside_docstrings(self):
+        """Test _filter_symbols_outside_docstrings helper method."""
+        chunker = HybridChunker()
+
+        symbols = [
+            Symbol(name="func1", kind="function", range=(1, 5)),
+            Symbol(name="func2", kind="function", range=(10, 15)),
+            Symbol(name="func3", kind="function", range=(20, 25)),
+        ]
+
+        # Exclude lines 1-5 (func1) and 10-12 (partial overlap with func2)
+        excluded_lines = set(range(1, 6)) | set(range(10, 13))
+
+        filtered = chunker._filter_symbols_outside_docstrings(symbols, excluded_lines)
+
+        # func1 should be filtered out (completely within excluded)
+        # func2 should remain (partial overlap)
+        # func3 should remain (no overlap)
+        assert len(filtered) == 2
+        names = [s.name for s in filtered]
+        assert "func1" not in names
+        assert "func2" in names
+        assert "func3" in names
+        excluded = chunker._get_excluded_line_ranges(docstrings)
+
+        assert 1 in excluded
+        assert 2 in excluded
+        assert 3 in excluded
+        assert 4 not in excluded
+        assert 5 in excluded
+        assert 6 in excluded
+        assert 7 in excluded
+        assert 8 not in excluded
+        assert 9 not in excluded
+        assert 10 in excluded
+
+    def test_filter_symbols_outside_docstrings(self):
+        """Test _filter_symbols_outside_docstrings helper method."""
+        chunker = HybridChunker()
+
+        symbols = [
+            Symbol(name="func1", kind="function", range=(1, 5)),
+            Symbol(name="func2", kind="function", range=(10, 15)),
+            Symbol(name="func3", kind="function", range=(20, 25)),
+        ]
+
+        # Exclude lines 1-5 (func1) and 10-12 (partial overlap with func2)
+        excluded_lines = set(range(1, 6)) | set(range(10, 13))
+
+        filtered = chunker._filter_symbols_outside_docstrings(symbols, excluded_lines)
+
+        # func1 should be filtered out (completely within excluded)
+        # func2 should remain (partial overlap)
+        # func3 should remain (no overlap)
+        assert len(filtered) == 2
+        names = [s.name for s in filtered]
+        assert "func1" not in names
+        assert "func2" in names
+        assert "func3" in names
+
+    def test_hybrid_chunker_performance_overhead(self):
+        """Test that hybrid chunker has <5% overhead vs base chunker."""
+        import time
+
+        config = ChunkConfig(min_chunk_size=5)
+
+        # Create content with no docstrings to measure worst-case overhead
+        lines = []
+        for i in range(100):
+            lines.append(f'def func{i}():\n')
+            lines.append(f'    return {i}\n')
+            lines.append('\n')
+        content = "".join(lines)
+        content = '''"""First docstring."""
+
+"""Second docstring."""
+
+"""Third docstring."""
+'''
+        chunks = chunker.chunk_file(content, [], "test.py", "python")
+
+        # Should only have docstring chunks
+        assert all(c.metadata.get("chunk_type") == "docstring" for c in chunks)
+        assert len(chunks) == 3
+
+
+class TestChunkConfigStrategy:
+    """Tests for strategy field in ChunkConfig."""
+
+    def test_chunk_config_default_strategy(self):
+        """Test that default strategy is 'auto'."""
+        config = ChunkConfig()
+        assert config.strategy == "auto"
+
+    def test_chunk_config_custom_strategy(self):
+        """Test setting custom strategy."""
+        config = ChunkConfig(strategy="hybrid")
+        assert config.strategy == "hybrid"
+
+        config = ChunkConfig(strategy="symbol")
+        assert config.strategy == "symbol"
+
+        config = ChunkConfig(strategy="sliding_window")
+        assert config.strategy == "sliding_window"
+
+
+class TestHybridChunkerIntegration:
+    """Integration tests for hybrid chunker with realistic code."""
+
+    def test_realistic_python_module(self):
+        """Test hybrid chunker with realistic Python module."""
+        config = ChunkConfig(min_chunk_size=10)
+        chunker = HybridChunker(config=config)
+
+        content = '''"""
+Data processing module for handling user data.
+
+This module provides functions for cleaning and validating user input.
+"""
+
+from typing import Dict, Any
+
+
+def validate_email(email: str) -> bool:
+    """
+    Validate an email address format.
+
+    Args:
+        email: The email address to validate
+
+    Returns:
+        True if valid, False otherwise
+    """
+    import re
+    pattern = r'^[\\w\\.-]+@[\\w\\.-]+\\.\\w+$'
+    return bool(re.match(pattern, email))
+
+
+class UserProfile:
+    """
+    User profile management class.
+
+    Handles user data storage and retrieval.
+    """
+
+    def __init__(self, user_id: int):
+        """Initialize user profile with ID."""
+        self.user_id = user_id
+        self.data = {}
+
+    def update_data(self, data: Dict[str, Any]) -> None:
+        """
+        Update user profile data.
+
+        Args:
+            data: Dictionary of user data to update
+        """
+        self.data.update(data)
+'''
+
+        symbols = [
+            Symbol(name="validate_email", kind="function", range=(11, 23)),
+            Symbol(name="UserProfile", kind="class", range=(26, 44)),
+        ]
+
+        chunks = chunker.chunk_file(content, symbols, "users.py", "python")
+
+        docstring_chunks = [c for c in chunks if c.metadata.get("chunk_type") == "docstring"]
+        code_chunks = [c for c in chunks if c.metadata.get("chunk_type") == "code"]
+
+        # Verify docstrings are isolated
+        assert len(docstring_chunks) >= 4  # Module, function, class, methods
+        assert len(code_chunks) >= 1  # At least one code chunk
+
+        # Verify >98% docstring isolation
+        # Count total docstring lines in original
+        total_docstring_lines = sum(
+            d[2] - d[1] + 1
+            for d in DocstringExtractor.extract_python_docstrings(content)
+        )
+        isolated_docstring_lines = sum(
+            c.metadata["end_line"] - c.metadata["start_line"] + 1
+            for c in docstring_chunks
+        )
+
+        isolation_rate = isolated_docstring_lines / total_docstring_lines if total_docstring_lines > 0 else 1
+        assert isolation_rate >= 0.98
+
+    def test_hybrid_chunker_performance_overhead(self):
+        """Test that hybrid chunker has <5% overhead vs base chunker on files without docstrings."""
+        import time
+
+        config = ChunkConfig(min_chunk_size=5)
+
+        # Create larger content with NO docstrings (worst case for hybrid chunker)
+        lines = []
+        for i in range(1000):
+            lines.append(f'def func{i}():\n')
+            lines.append(f'    x = {i}\n')
+            lines.append(f'    y = {i * 2}\n')
+            lines.append(f'    return x + y\n')
+            lines.append('\n')
+        content = "".join(lines)
+
+        symbols = [
+            Symbol(name=f"func{i}", kind="function", range=(1 + i*5, 4 + i*5))
+            for i in range(1000)
+        ]
+
+        # Warm up
+        base_chunker = Chunker(config=config)
+        base_chunker.chunk_file(content[:100], symbols[:10], "test.py", "python")
+
+        hybrid_chunker = HybridChunker(config=config)
+        hybrid_chunker.chunk_file(content[:100], symbols[:10], "test.py", "python")
+
+        # Measure base chunker (3 runs)
+        base_times = []
+        for _ in range(3):
+            start = time.perf_counter()
+            base_chunker.chunk_file(content, symbols, "test.py", "python")
+            base_times.append(time.perf_counter() - start)
+        base_time = sum(base_times) / len(base_times)
+
+        # Measure hybrid chunker (3 runs)
+        hybrid_times = []
+        for _ in range(3):
+            start = time.perf_counter()
+            hybrid_chunker.chunk_file(content, symbols, "test.py", "python")
+            hybrid_times.append(time.perf_counter() - start)
+        hybrid_time = sum(hybrid_times) / len(hybrid_times)
+
+        # Calculate overhead
+        overhead = ((hybrid_time - base_time) / base_time) * 100 if base_time > 0 else 0
+
+        # Verify <5% overhead
+        assert overhead < 5.0, f"Overhead {overhead:.2f}% exceeds 5% threshold (base={base_time:.4f}s, hybrid={hybrid_time:.4f}s)"
+