mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-02-05 01:50:27 +08:00
- Added JSON-based settings management in Config class for embedding and LLM configurations. - Introduced methods to save and load settings from a JSON file. - Updated BaseEmbedder and its subclasses to include max_tokens property for better token management. - Enhanced chunking strategy to support recursive splitting of large symbols with improved overlap handling. - Implemented comprehensive tests for recursive splitting and chunking behavior. - Added CLI tools configuration management for better integration with external tools. - Introduced a new command for compacting session memory into structured text for recovery.
292 lines
9.3 KiB
Python
292 lines
9.3 KiB
Python
"""Tests for recursive splitting of large symbols in chunker."""
|
|
|
|
import pytest
|
|
from codexlens.entities import Symbol
|
|
from codexlens.semantic.chunker import Chunker, ChunkConfig
|
|
|
|
|
|
class TestRecursiveSplitting:
    """Test cases for recursive splitting of large symbols.

    Every test feeds ``Chunker.chunk_by_symbol`` a small source snippet plus
    the ``Symbol`` ranges that cover it, then checks the number of chunks and
    their metadata.

    The 4-space indentation inside the snippet literals is part of the
    fixture: several expectations depend on how large a snippet is relative
    to ``max_chunk_size``.
    """

    def test_small_symbol_no_split(self):
        """Test that small symbols are not split."""
        config = ChunkConfig(max_chunk_size=1000, overlap=100)
        chunker = Chunker(config)

        content = '''def small_function():
    # This is a small function
    x = 1
    y = 2
    return x + y
'''
        symbols = [Symbol(name='small_function', kind='function', range=(1, 5))]

        chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python')

        assert len(chunks) == 1
        assert chunks[0].metadata['strategy'] == 'symbol'
        assert chunks[0].metadata['symbol_name'] == 'small_function'
        assert chunks[0].metadata['symbol_kind'] == 'function'
        # Only sub-chunks of a split symbol carry the parent's range.
        assert 'parent_symbol_range' not in chunks[0].metadata

    def test_large_symbol_splits(self):
        """Test that large symbols are recursively split."""
        config = ChunkConfig(max_chunk_size=100, overlap=20)
        chunker = Chunker(config)

        content = '''def large_function():
    # Line 1
    # Line 2
    # Line 3
    # Line 4
    # Line 5
    # Line 6
    # Line 7
    # Line 8
    # Line 9
    # Line 10
    # Line 11
    # Line 12
    # Line 13
    # Line 14
    # Line 15
    pass
'''
        symbols = [Symbol(name='large_function', kind='function', range=(1, 18))]

        chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python')

        # Should be split into multiple chunks
        assert len(chunks) > 1

        # All chunks should have symbol metadata
        for chunk in chunks:
            assert chunk.metadata['strategy'] == 'symbol_split'
            assert chunk.metadata['symbol_name'] == 'large_function'
            assert chunk.metadata['symbol_kind'] == 'function'
            assert chunk.metadata['parent_symbol_range'] == (1, 18)

    def test_boundary_condition(self):
        """Test a symbol whose size is just over the max_chunk_size boundary."""
        config = ChunkConfig(max_chunk_size=90, overlap=20)
        chunker = Chunker(config)

        content = '''def boundary_function():
    # This function is exactly at boundary
    x = 1
    y = 2
    return x + y
'''
        symbols = [Symbol(name='boundary_function', kind='function', range=(1, 5))]

        chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python')

        # Content is slightly over 90 chars, should be split
        assert len(chunks) >= 1
        assert chunks[0].metadata['strategy'] == 'symbol_split'

    def test_multiple_symbols_mixed_sizes(self):
        """Test chunking with multiple symbols of different sizes."""
        config = ChunkConfig(max_chunk_size=150, overlap=30)
        chunker = Chunker(config)

        content = '''def small():
    return 1

def medium():
    # Medium function
    x = 1
    y = 2
    z = 3
    return x + y + z

def very_large():
    # Line 1
    # Line 2
    # Line 3
    # Line 4
    # Line 5
    # Line 6
    # Line 7
    # Line 8
    # Line 9
    # Line 10
    # Line 11
    # Line 12
    # Line 13
    # Line 14
    # Line 15
    pass
'''
        symbols = [
            Symbol(name='small', kind='function', range=(1, 2)),
            Symbol(name='medium', kind='function', range=(4, 9)),
            Symbol(name='very_large', kind='function', range=(11, 28)),
        ]

        chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python')

        # Find chunks for each symbol
        small_chunks = [c for c in chunks if c.metadata['symbol_name'] == 'small']
        medium_chunks = [c for c in chunks if c.metadata['symbol_name'] == 'medium']
        large_chunks = [c for c in chunks if c.metadata['symbol_name'] == 'very_large']

        # Small should be filtered (< min_chunk_size)
        assert len(small_chunks) == 0

        # Medium should not be split
        assert len(medium_chunks) == 1
        assert medium_chunks[0].metadata['strategy'] == 'symbol'

        # Large should be split
        assert len(large_chunks) > 1
        for chunk in large_chunks:
            assert chunk.metadata['strategy'] == 'symbol_split'

    def test_line_numbers_preserved(self):
        """Test that line numbers are correctly preserved in sub-chunks."""
        config = ChunkConfig(max_chunk_size=100, overlap=20)
        chunker = Chunker(config)

        content = '''def large_function():
    # Line 1 with some extra content to make it longer
    # Line 2 with some extra content to make it longer
    # Line 3 with some extra content to make it longer
    # Line 4 with some extra content to make it longer
    # Line 5 with some extra content to make it longer
    # Line 6 with some extra content to make it longer
    # Line 7 with some extra content to make it longer
    # Line 8 with some extra content to make it longer
    # Line 9 with some extra content to make it longer
    # Line 10 with some extra content to make it longer
    pass
'''
        symbols = [Symbol(name='large_function', kind='function', range=(1, 13))]

        chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python')

        # The symbol must be split and the first sub-chunk must begin on the
        # symbol's first line.
        assert len(chunks) > 1
        assert chunks[0].metadata['start_line'] == 1

        # Each chunk should have valid line numbers that stay inside the
        # symbol's declared range.
        for chunk in chunks:
            assert chunk.metadata['start_line'] >= 1
            assert chunk.metadata['end_line'] <= 13
            assert chunk.metadata['start_line'] <= chunk.metadata['end_line']

    def test_overlap_in_split_chunks(self):
        """Test that overlap is applied when splitting large symbols."""
        config = ChunkConfig(max_chunk_size=100, overlap=30)
        chunker = Chunker(config)

        content = '''def large_function():
    # Line 1
    # Line 2
    # Line 3
    # Line 4
    # Line 5
    # Line 6
    # Line 7
    # Line 8
    # Line 9
    # Line 10
    # Line 11
    # Line 12
    pass
'''
        symbols = [Symbol(name='large_function', kind='function', range=(1, 14))]

        chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python')

        # The symbol must actually be split; without this check the overlap
        # comparison below would have no pairs to inspect and the test would
        # pass vacuously.
        assert len(chunks) > 1

        # With overlap, each chunk should start at or before the line on
        # which the previous chunk ended.
        for current, following in zip(chunks, chunks[1:]):
            current_end = current.metadata['end_line']
            next_start = following.metadata['start_line']
            assert next_start <= current_end

    def test_empty_symbol_filtered(self):
        """Test that symbols smaller than min_chunk_size are filtered."""
        config = ChunkConfig(max_chunk_size=1000, min_chunk_size=50)
        chunker = Chunker(config)

        content = '''def tiny():
    pass
'''
        symbols = [Symbol(name='tiny', kind='function', range=(1, 2))]

        chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python')

        # Should be filtered due to min_chunk_size
        assert len(chunks) == 0

    def test_class_symbol_splits(self):
        """Test that large class symbols are also split correctly."""
        config = ChunkConfig(max_chunk_size=120, overlap=25)
        chunker = Chunker(config)

        content = '''class LargeClass:
    """A large class with many methods."""

    def method1(self):
        return 1

    def method2(self):
        return 2

    def method3(self):
        return 3

    def method4(self):
        return 4
'''
        symbols = [Symbol(name='LargeClass', kind='class', range=(1, 14))]

        chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python')

        # Should be split
        assert len(chunks) > 1

        # All chunks should preserve class metadata
        for chunk in chunks:
            assert chunk.metadata['symbol_name'] == 'LargeClass'
            assert chunk.metadata['symbol_kind'] == 'class'
            assert chunk.metadata['strategy'] == 'symbol_split'
|
|
|
|
|
|
class TestLightweightMode:
    """Recursive-splitting checks with lightweight token counting enabled."""

    def test_large_symbol_splits_lightweight_mode(self):
        """Large symbols must still be split when ``skip_token_count=True``."""
        lightweight_chunker = Chunker(
            ChunkConfig(max_chunk_size=100, overlap=20, skip_token_count=True)
        )

        source = '''def large_function():
    # Line 1 with some extra content to make it longer
    # Line 2 with some extra content to make it longer
    # Line 3 with some extra content to make it longer
    # Line 4 with some extra content to make it longer
    # Line 5 with some extra content to make it longer
    # Line 6 with some extra content to make it longer
    # Line 7 with some extra content to make it longer
    # Line 8 with some extra content to make it longer
    # Line 9 with some extra content to make it longer
    # Line 10 with some extra content to make it longer
    pass
'''
        function_symbol = Symbol(
            name='large_function', kind='function', range=(1, 13)
        )

        pieces = lightweight_chunker.chunk_by_symbol(
            source, [function_symbol], 'test.py', 'python'
        )

        # Lightweight mode must not prevent the symbol from being split.
        assert len(pieces) > 1

        # Every piece still reports a positive (estimated) token_count.
        for piece in pieces:
            metadata = piece.metadata
            assert 'token_count' in metadata
            assert metadata['token_count'] > 0
|