# Claude-Code-Workflow/codex-lens/tests/test_recursive_splitting.py

"""Tests for recursive splitting of large symbols in chunker."""

import pytest
from codexlens.entities import Symbol
from codexlens.semantic.chunker import Chunker, ChunkConfig


class TestRecursiveSplitting:
    """Exercise ``Chunker.chunk_by_symbol`` on symbols of varying size.

    Each test builds a ``ChunkConfig``, feeds a small Python snippet plus
    hand-written ``Symbol`` ranges to ``chunk_by_symbol`` and inspects the
    ``metadata`` dict of the returned chunks. Unsplit symbols are expected to
    report ``strategy == 'symbol'``; split symbols ``strategy == 'symbol_split'``.
    """

    def test_small_symbol_no_split(self):
        """A symbol well under ``max_chunk_size`` yields exactly one unsplit chunk."""
        config = ChunkConfig(max_chunk_size=1000, overlap=100)
        chunker = Chunker(config)

        content = '''def small_function():
    # This is a small function
    x = 1
    y = 2
    return x + y
'''
        symbols = [Symbol(name='small_function', kind='function', range=(1, 5))]

        chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python')

        assert len(chunks) == 1
        assert chunks[0].metadata['strategy'] == 'symbol'
        assert chunks[0].metadata['symbol_name'] == 'small_function'
        assert chunks[0].metadata['symbol_kind'] == 'function'
        # The parent range is only expected on sub-chunks of a split symbol.
        assert 'parent_symbol_range' not in chunks[0].metadata

    def test_large_symbol_splits(self):
        """A symbol larger than ``max_chunk_size`` is split into several chunks."""
        config = ChunkConfig(max_chunk_size=100, overlap=20)
        chunker = Chunker(config)

        content = '''def large_function():
    # Line 1
    # Line 2
    # Line 3
    # Line 4
    # Line 5
    # Line 6
    # Line 7
    # Line 8
    # Line 9
    # Line 10
    # Line 11
    # Line 12
    # Line 13
    # Line 14
    # Line 15
    pass
'''
        # NOTE(review): the snippet has 17 lines but the declared range ends at
        # 18 -- left as-is; confirm whether the over-long range is intentional.
        symbols = [Symbol(name='large_function', kind='function', range=(1, 18))]

        chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python')

        # Should be split into multiple chunks
        assert len(chunks) > 1

        # Every sub-chunk keeps the symbol identity and the original range.
        for chunk in chunks:
            assert chunk.metadata['strategy'] == 'symbol_split'
            assert chunk.metadata['symbol_name'] == 'large_function'
            assert chunk.metadata['symbol_kind'] == 'function'
            assert chunk.metadata['parent_symbol_range'] == (1, 18)

    def test_boundary_condition(self):
        """A symbol just over ``max_chunk_size`` takes the split path.

        The snippet is roughly 105 characters against a limit of 90, so it is
        expected to be handled as a split symbol even if the split produces
        only a single chunk.
        """
        config = ChunkConfig(max_chunk_size=90, overlap=20)
        chunker = Chunker(config)

        content = '''def boundary_function():
    # This function is exactly at boundary
    x = 1
    y = 2
    return x + y
'''
        symbols = [Symbol(name='boundary_function', kind='function', range=(1, 5))]

        chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python')

        # Content is slightly over 90 chars, so the split strategy applies;
        # only the strategy is pinned here, not the number of sub-chunks.
        assert len(chunks) >= 1
        assert chunks[0].metadata['strategy'] == 'symbol_split'

    def test_multiple_symbols_mixed_sizes(self):
        """Symbols of different sizes in one file are each handled on their own."""
        config = ChunkConfig(max_chunk_size=150, overlap=30)
        chunker = Chunker(config)

        content = '''def small():
    return 1

def medium():
    # Medium function
    x = 1
    y = 2
    z = 3
    return x + y + z

def very_large():
    # Line 1
    # Line 2
    # Line 3
    # Line 4
    # Line 5
    # Line 6
    # Line 7
    # Line 8
    # Line 9
    # Line 10
    # Line 11
    # Line 12
    # Line 13
    # Line 14
    # Line 15
    pass
'''
        # NOTE(review): ``very_large`` ends on line 27 of the snippet while its
        # declared range ends at 28 -- left as-is; confirm intent.
        symbols = [
            Symbol(name='small', kind='function', range=(1, 2)),
            Symbol(name='medium', kind='function', range=(4, 9)),
            Symbol(name='very_large', kind='function', range=(11, 28)),
        ]

        chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python')

        # Find chunks for each symbol
        small_chunks = [c for c in chunks if c.metadata['symbol_name'] == 'small']
        medium_chunks = [c for c in chunks if c.metadata['symbol_name'] == 'medium']
        large_chunks = [c for c in chunks if c.metadata['symbol_name'] == 'very_large']

        # Small should be filtered (< min_chunk_size); this relies on the
        # config's default min_chunk_size, which is not set explicitly here.
        assert len(small_chunks) == 0

        # Medium should not be split
        assert len(medium_chunks) == 1
        assert medium_chunks[0].metadata['strategy'] == 'symbol'

        # Large should be split
        assert len(large_chunks) > 1
        for chunk in large_chunks:
            assert chunk.metadata['strategy'] == 'symbol_split'

    def test_line_numbers_preserved(self):
        """Sub-chunks report line numbers that stay inside the parent symbol range."""
        config = ChunkConfig(max_chunk_size=100, overlap=20)
        chunker = Chunker(config)

        content = '''def large_function():
    # Line 1 with some extra content to make it longer
    # Line 2 with some extra content to make it longer
    # Line 3 with some extra content to make it longer
    # Line 4 with some extra content to make it longer
    # Line 5 with some extra content to make it longer
    # Line 6 with some extra content to make it longer
    # Line 7 with some extra content to make it longer
    # Line 8 with some extra content to make it longer
    # Line 9 with some extra content to make it longer
    # Line 10 with some extra content to make it longer
    pass
'''
        # NOTE(review): the snippet has 12 lines but the declared range ends at
        # 13 -- left as-is; confirm intent.
        symbols = [Symbol(name='large_function', kind='function', range=(1, 13))]

        chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python')

        # The symbol must be split and the first sub-chunk starts at its first line.
        assert len(chunks) > 1
        assert chunks[0].metadata['start_line'] == 1

        # Each chunk's line span must be well-formed and within the declared range.
        for chunk in chunks:
            assert chunk.metadata['start_line'] >= 1
            assert chunk.metadata['end_line'] <= 13
            assert chunk.metadata['start_line'] <= chunk.metadata['end_line']

    def test_overlap_in_split_chunks(self):
        """Consecutive sub-chunks of a split symbol overlap in their line ranges."""
        config = ChunkConfig(max_chunk_size=100, overlap=30)
        chunker = Chunker(config)

        content = '''def large_function():
    # Line 1
    # Line 2
    # Line 3
    # Line 4
    # Line 5
    # Line 6
    # Line 7
    # Line 8
    # Line 9
    # Line 10
    # Line 11
    # Line 12
    pass
'''
        symbols = [Symbol(name='large_function', kind='function', range=(1, 14))]

        chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python')

        # The symbol must actually be split; without this the overlap check
        # below would pass vacuously on a single chunk.
        assert len(chunks) > 1

        # With overlap, the next chunk starts at or before the line where the
        # current chunk ends.
        for current, following in zip(chunks, chunks[1:]):
            assert following.metadata['start_line'] <= current.metadata['end_line']

    def test_empty_symbol_filtered(self):
        """Test that symbols smaller than min_chunk_size are filtered."""
        config = ChunkConfig(max_chunk_size=1000, min_chunk_size=50)
        chunker = Chunker(config)

        content = '''def tiny():
    pass
'''
        symbols = [Symbol(name='tiny', kind='function', range=(1, 2))]

        chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python')

        # Should be filtered due to min_chunk_size
        assert len(chunks) == 0

    def test_class_symbol_splits(self):
        """A large ``class`` symbol is split and keeps its class metadata."""
        config = ChunkConfig(max_chunk_size=120, overlap=25)
        chunker = Chunker(config)

        content = '''class LargeClass:
    """A large class with many methods."""

    def method1(self):
        return 1

    def method2(self):
        return 2

    def method3(self):
        return 3

    def method4(self):
        return 4
'''
        symbols = [Symbol(name='LargeClass', kind='class', range=(1, 14))]

        chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python')

        # Should be split
        assert len(chunks) > 1

        # All chunks should preserve class metadata
        for chunk in chunks:
            assert chunk.metadata['symbol_name'] == 'LargeClass'
            assert chunk.metadata['symbol_kind'] == 'class'
            assert chunk.metadata['strategy'] == 'symbol_split'


class TestLightweightMode:
    """Recursive-splitting checks run with ``skip_token_count=True`` on the config."""

    def test_large_symbol_splits_lightweight_mode(self):
        """An oversized function is still split when token counting is skipped."""
        lightweight_chunker = Chunker(
            ChunkConfig(max_chunk_size=100, overlap=20, skip_token_count=True)
        )

        source = '''def large_function():
    # Line 1 with some extra content to make it longer
    # Line 2 with some extra content to make it longer
    # Line 3 with some extra content to make it longer
    # Line 4 with some extra content to make it longer
    # Line 5 with some extra content to make it longer
    # Line 6 with some extra content to make it longer
    # Line 7 with some extra content to make it longer
    # Line 8 with some extra content to make it longer
    # Line 9 with some extra content to make it longer
    # Line 10 with some extra content to make it longer
    pass
'''
        target = Symbol(name='large_function', kind='function', range=(1, 13))

        pieces = lightweight_chunker.chunk_by_symbol(
            source, [target], 'test.py', 'python'
        )

        # Lightweight mode must not disable splitting.
        assert len(pieces) > 1

        # Every piece still carries a positive token_count entry
        # (an estimate in this mode, per the original test's comment).
        for piece in pieces:
            meta = piece.metadata
            assert 'token_count' in meta
            assert meta['token_count'] > 0