Files
Claude-Code-Workflow/codex-lens/tests/test_recursive_splitting.py
catlog22 e671b45948 feat: Enhance configuration management and embedding capabilities
- Added JSON-based settings management in Config class for embedding and LLM configurations.
- Introduced methods to save and load settings from a JSON file.
- Updated BaseEmbedder and its subclasses to include max_tokens property for better token management.
- Enhanced chunking strategy to support recursive splitting of large symbols with improved overlap handling.
- Implemented comprehensive tests for recursive splitting and chunking behavior.
- Added CLI tools configuration management for better integration with external tools.
- Introduced a new command for compacting session memory into structured text for recovery.
2025-12-24 16:32:27 +08:00

292 lines
9.3 KiB
Python

"""Tests for recursive splitting of large symbols in chunker."""
import pytest
from codexlens.entities import Symbol
from codexlens.semantic.chunker import Chunker, ChunkConfig
class TestRecursiveSplitting:
    """Test cases for recursive splitting of large symbols.

    Every test hands ``Chunker.chunk_by_symbol`` a small Python fixture plus
    hand-written ``Symbol`` ranges and then inspects the metadata of the
    returned chunks.

    The fixtures are written flush-left inside their triple-quoted strings on
    purpose: their exact text (indentation and blank lines included) decides
    how large each symbol is and which 1-based line numbers the ``range``
    tuples refer to. The size limits appear to be character counts, going by
    the note in ``test_boundary_condition`` -- confirm against ``ChunkConfig``.
    """

    def test_small_symbol_no_split(self):
        """Test that small symbols are not split."""
        config = ChunkConfig(max_chunk_size=1000, overlap=100)
        chunker = Chunker(config)

        content = '''def small_function():
    # This is a small function
    x = 1
    y = 2
    return x + y
'''
        symbols = [Symbol(name='small_function', kind='function', range=(1, 5))]

        chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python')

        assert len(chunks) == 1
        assert chunks[0].metadata['strategy'] == 'symbol'
        assert chunks[0].metadata['symbol_name'] == 'small_function'
        assert chunks[0].metadata['symbol_kind'] == 'function'
        # An unsplit chunk must not carry a back-reference to a parent range.
        assert 'parent_symbol_range' not in chunks[0].metadata

    def test_large_symbol_splits(self):
        """Test that large symbols are recursively split."""
        config = ChunkConfig(max_chunk_size=100, overlap=20)
        chunker = Chunker(config)

        content = '''def large_function():
    # Line 1
    # Line 2
    # Line 3
    # Line 4
    # Line 5
    # Line 6
    # Line 7
    # Line 8
    # Line 9
    # Line 10
    # Line 11
    # Line 12
    # Line 13
    # Line 14
    # Line 15
    pass
'''
        symbols = [Symbol(name='large_function', kind='function', range=(1, 18))]

        chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python')

        # Should be split into multiple chunks
        assert len(chunks) > 1

        # Every sub-chunk keeps the symbol metadata and points at the parent range
        for chunk in chunks:
            assert chunk.metadata['strategy'] == 'symbol_split'
            assert chunk.metadata['symbol_name'] == 'large_function'
            assert chunk.metadata['symbol_kind'] == 'function'
            assert chunk.metadata['parent_symbol_range'] == (1, 18)

    def test_boundary_condition(self):
        """Test that a symbol just over max_chunk_size is split."""
        config = ChunkConfig(max_chunk_size=90, overlap=20)
        chunker = Chunker(config)

        content = '''def boundary_function():
    # This function is exactly at boundary
    x = 1
    y = 2
    return x + y
'''
        symbols = [Symbol(name='boundary_function', kind='function', range=(1, 5))]

        chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python')

        # Content is slightly over 90 chars, should be split
        assert len(chunks) >= 1
        assert chunks[0].metadata['strategy'] == 'symbol_split'

    def test_multiple_symbols_mixed_sizes(self):
        """Test chunking with multiple symbols of different sizes."""
        config = ChunkConfig(max_chunk_size=150, overlap=30)
        chunker = Chunker(config)

        # The blank separator lines are significant: the Symbol ranges below
        # (1-2, 4-9, 11-28) are line numbers into this exact text.
        content = '''def small():
    return 1

def medium():
    # Medium function
    x = 1
    y = 2
    z = 3
    return x + y + z

def very_large():
    # Line 1
    # Line 2
    # Line 3
    # Line 4
    # Line 5
    # Line 6
    # Line 7
    # Line 8
    # Line 9
    # Line 10
    # Line 11
    # Line 12
    # Line 13
    # Line 14
    # Line 15
    pass
'''
        symbols = [
            Symbol(name='small', kind='function', range=(1, 2)),
            Symbol(name='medium', kind='function', range=(4, 9)),
            Symbol(name='very_large', kind='function', range=(11, 28)),
        ]

        chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python')

        # Find chunks for each symbol
        small_chunks = [c for c in chunks if c.metadata['symbol_name'] == 'small']
        medium_chunks = [c for c in chunks if c.metadata['symbol_name'] == 'medium']
        large_chunks = [c for c in chunks if c.metadata['symbol_name'] == 'very_large']

        # Small should be filtered (< min_chunk_size)
        assert len(small_chunks) == 0

        # Medium should not be split
        assert len(medium_chunks) == 1
        assert medium_chunks[0].metadata['strategy'] == 'symbol'

        # Large should be split
        assert len(large_chunks) > 1
        for chunk in large_chunks:
            assert chunk.metadata['strategy'] == 'symbol_split'

    def test_line_numbers_preserved(self):
        """Test that line numbers are correctly preserved in sub-chunks."""
        config = ChunkConfig(max_chunk_size=100, overlap=20)
        chunker = Chunker(config)

        content = '''def large_function():
    # Line 1 with some extra content to make it longer
    # Line 2 with some extra content to make it longer
    # Line 3 with some extra content to make it longer
    # Line 4 with some extra content to make it longer
    # Line 5 with some extra content to make it longer
    # Line 6 with some extra content to make it longer
    # Line 7 with some extra content to make it longer
    # Line 8 with some extra content to make it longer
    # Line 9 with some extra content to make it longer
    # Line 10 with some extra content to make it longer
    pass
'''
        symbols = [Symbol(name='large_function', kind='function', range=(1, 13))]

        chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python')

        # The split must happen and the first sub-chunk starts at the symbol's first line
        assert len(chunks) > 1
        assert chunks[0].metadata['start_line'] == 1

        # Each chunk should have valid line numbers inside the symbol's range
        for chunk in chunks:
            assert chunk.metadata['start_line'] >= 1
            assert chunk.metadata['end_line'] <= 13
            assert chunk.metadata['start_line'] <= chunk.metadata['end_line']

    def test_overlap_in_split_chunks(self):
        """Test that overlap is applied when splitting large symbols."""
        config = ChunkConfig(max_chunk_size=100, overlap=30)
        chunker = Chunker(config)

        content = '''def large_function():
    # Line 1
    # Line 2
    # Line 3
    # Line 4
    # Line 5
    # Line 6
    # Line 7
    # Line 8
    # Line 9
    # Line 10
    # Line 11
    # Line 12
    pass
'''
        symbols = [Symbol(name='large_function', kind='function', range=(1, 14))]

        chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python')

        # The fixture is well over max_chunk_size, so a split is required.
        # Asserting it (rather than guarding with ``if``) keeps the overlap
        # check below from passing vacuously when no split happens.
        assert len(chunks) > 1

        # With overlap, each chunk must start on or before the line where the
        # previous chunk ended.
        for current, following in zip(chunks, chunks[1:]):
            current_end = current.metadata['end_line']
            next_start = following.metadata['start_line']
            assert next_start <= current_end

    def test_empty_symbol_filtered(self):
        """Test that symbols smaller than min_chunk_size are filtered."""
        config = ChunkConfig(max_chunk_size=1000, min_chunk_size=50)
        chunker = Chunker(config)

        content = '''def tiny():
    pass
'''
        symbols = [Symbol(name='tiny', kind='function', range=(1, 2))]

        chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python')

        # Should be filtered due to min_chunk_size
        assert len(chunks) == 0

    def test_class_symbol_splits(self):
        """Test that large class symbols are also split correctly."""
        config = ChunkConfig(max_chunk_size=120, overlap=25)
        chunker = Chunker(config)

        # 14 lines including the blank separators, matching range=(1, 14) below.
        content = '''class LargeClass:
    """A large class with many methods."""

    def method1(self):
        return 1

    def method2(self):
        return 2

    def method3(self):
        return 3

    def method4(self):
        return 4
'''
        symbols = [Symbol(name='LargeClass', kind='class', range=(1, 14))]

        chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python')

        # Should be split
        assert len(chunks) > 1

        # All chunks should preserve class metadata
        for chunk in chunks:
            assert chunk.metadata['symbol_name'] == 'LargeClass'
            assert chunk.metadata['symbol_kind'] == 'class'
            assert chunk.metadata['strategy'] == 'symbol_split'
class TestLightweightMode:
    """Test recursive splitting with lightweight token counting.

    "Lightweight" here means the chunker is configured with
    ``skip_token_count=True``; the test only checks that splitting still
    happens and that each chunk still reports a positive ``token_count``.
    """

    def test_large_symbol_splits_lightweight_mode(self):
        """Test that large symbols split correctly in lightweight mode."""
        config = ChunkConfig(max_chunk_size=100, overlap=20, skip_token_count=True)
        chunker = Chunker(config)

        # Written flush-left so the embedded function keeps a valid, indented body.
        content = '''def large_function():
    # Line 1 with some extra content to make it longer
    # Line 2 with some extra content to make it longer
    # Line 3 with some extra content to make it longer
    # Line 4 with some extra content to make it longer
    # Line 5 with some extra content to make it longer
    # Line 6 with some extra content to make it longer
    # Line 7 with some extra content to make it longer
    # Line 8 with some extra content to make it longer
    # Line 9 with some extra content to make it longer
    # Line 10 with some extra content to make it longer
    pass
'''
        symbols = [Symbol(name='large_function', kind='function', range=(1, 13))]

        chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python')

        # Should split even in lightweight mode
        assert len(chunks) > 1

        # All chunks should have token_count (estimated)
        for chunk in chunks:
            assert 'token_count' in chunk.metadata
            assert chunk.metadata['token_count'] > 0