mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-02-05 01:50:27 +08:00
feat: Enhance configuration management and embedding capabilities
- Added JSON-based settings management in Config class for embedding and LLM configurations. - Introduced methods to save and load settings from a JSON file. - Updated BaseEmbedder and its subclasses to include max_tokens property for better token management. - Enhanced chunking strategy to support recursive splitting of large symbols with improved overlap handling. - Implemented comprehensive tests for recursive splitting and chunking behavior. - Added CLI tools configuration management for better integration with external tools. - Introduced a new command for compacting session memory into structured text for recovery.
This commit is contained in:
291
codex-lens/tests/test_recursive_splitting.py
Normal file
291
codex-lens/tests/test_recursive_splitting.py
Normal file
@@ -0,0 +1,291 @@
|
||||
"""Tests for recursive splitting of large symbols in chunker."""
|
||||
|
||||
import pytest
|
||||
from codexlens.entities import Symbol
|
||||
from codexlens.semantic.chunker import Chunker, ChunkConfig
|
||||
|
||||
|
||||
class TestRecursiveSplitting:
    """Test cases for recursive splitting of large symbols.

    Every test feeds a small Python sample plus hand-written ``Symbol``
    ranges to ``Chunker.chunk_by_symbol`` and inspects the ``metadata`` of
    the returned chunks (``strategy``, ``symbol_name``, ``symbol_kind``,
    ``parent_symbol_range``, ``start_line`` and ``end_line``).
    """

    def test_small_symbol_no_split(self):
        """Test that small symbols are not split."""
        config = ChunkConfig(max_chunk_size=1000, overlap=100)
        chunker = Chunker(config)

        content = '''def small_function():
    # This is a small function
    x = 1
    y = 2
    return x + y
'''
        symbols = [Symbol(name='small_function', kind='function', range=(1, 5))]

        chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python')

        assert len(chunks) == 1
        assert chunks[0].metadata['strategy'] == 'symbol'
        assert chunks[0].metadata['symbol_name'] == 'small_function'
        assert chunks[0].metadata['symbol_kind'] == 'function'
        # Only chunks produced by splitting are expected to carry the
        # parent range.
        assert 'parent_symbol_range' not in chunks[0].metadata

    def test_large_symbol_splits(self):
        """Test that large symbols are recursively split."""
        config = ChunkConfig(max_chunk_size=100, overlap=20)
        chunker = Chunker(config)

        content = '''def large_function():
    # Line 1
    # Line 2
    # Line 3
    # Line 4
    # Line 5
    # Line 6
    # Line 7
    # Line 8
    # Line 9
    # Line 10
    # Line 11
    # Line 12
    # Line 13
    # Line 14
    # Line 15
    pass
'''
        symbols = [Symbol(name='large_function', kind='function', range=(1, 18))]

        chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python')

        # Should be split into multiple chunks
        assert len(chunks) > 1

        # All chunks should have symbol metadata
        for chunk in chunks:
            assert chunk.metadata['strategy'] == 'symbol_split'
            assert chunk.metadata['symbol_name'] == 'large_function'
            assert chunk.metadata['symbol_kind'] == 'function'
            assert chunk.metadata['parent_symbol_range'] == (1, 18)

    def test_boundary_condition(self):
        """Test a symbol whose size is just past the max_chunk_size boundary."""
        config = ChunkConfig(max_chunk_size=90, overlap=20)
        chunker = Chunker(config)

        content = '''def boundary_function():
    # This function is exactly at boundary
    x = 1
    y = 2
    return x + y
'''
        symbols = [Symbol(name='boundary_function', kind='function', range=(1, 5))]

        chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python')

        # Content is slightly over 90 chars (105 including the body
        # indentation), should be split
        assert len(chunks) >= 1
        assert chunks[0].metadata['strategy'] == 'symbol_split'

    def test_multiple_symbols_mixed_sizes(self):
        """Test chunking with multiple symbols of different sizes."""
        config = ChunkConfig(max_chunk_size=150, overlap=30)
        chunker = Chunker(config)

        content = '''def small():
    return 1

def medium():
    # Medium function
    x = 1
    y = 2
    z = 3
    return x + y + z

def very_large():
    # Line 1
    # Line 2
    # Line 3
    # Line 4
    # Line 5
    # Line 6
    # Line 7
    # Line 8
    # Line 9
    # Line 10
    # Line 11
    # Line 12
    # Line 13
    # Line 14
    # Line 15
    pass
'''
        symbols = [
            Symbol(name='small', kind='function', range=(1, 2)),
            Symbol(name='medium', kind='function', range=(4, 9)),
            Symbol(name='very_large', kind='function', range=(11, 28)),
        ]

        chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python')

        # Find chunks for each symbol
        small_chunks = [c for c in chunks if c.metadata['symbol_name'] == 'small']
        medium_chunks = [c for c in chunks if c.metadata['symbol_name'] == 'medium']
        large_chunks = [c for c in chunks if c.metadata['symbol_name'] == 'very_large']

        # Small should be filtered (< min_chunk_size)
        assert len(small_chunks) == 0

        # Medium should not be split
        assert len(medium_chunks) == 1
        assert medium_chunks[0].metadata['strategy'] == 'symbol'

        # Large should be split
        assert len(large_chunks) > 1
        for chunk in large_chunks:
            assert chunk.metadata['strategy'] == 'symbol_split'

    def test_line_numbers_preserved(self):
        """Test that line numbers are correctly preserved in sub-chunks."""
        config = ChunkConfig(max_chunk_size=100, overlap=20)
        chunker = Chunker(config)

        content = '''def large_function():
    # Line 1 with some extra content to make it longer
    # Line 2 with some extra content to make it longer
    # Line 3 with some extra content to make it longer
    # Line 4 with some extra content to make it longer
    # Line 5 with some extra content to make it longer
    # Line 6 with some extra content to make it longer
    # Line 7 with some extra content to make it longer
    # Line 8 with some extra content to make it longer
    # Line 9 with some extra content to make it longer
    # Line 10 with some extra content to make it longer
    pass
'''
        symbols = [Symbol(name='large_function', kind='function', range=(1, 13))]

        chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python')

        # The symbol must be split, and the first sub-chunk must start on
        # the symbol's first line.
        assert len(chunks) > 1
        assert chunks[0].metadata['start_line'] == 1

        # Each chunk should have valid line numbers inside the symbol range
        for chunk in chunks:
            assert chunk.metadata['start_line'] >= 1
            assert chunk.metadata['end_line'] <= 13
            assert chunk.metadata['start_line'] <= chunk.metadata['end_line']

    def test_overlap_in_split_chunks(self):
        """Test that overlap is applied when splitting large symbols."""
        config = ChunkConfig(max_chunk_size=100, overlap=30)
        chunker = Chunker(config)

        content = '''def large_function():
    # Line 1
    # Line 2
    # Line 3
    # Line 4
    # Line 5
    # Line 6
    # Line 7
    # Line 8
    # Line 9
    # Line 10
    # Line 11
    # Line 12
    pass
'''
        symbols = [Symbol(name='large_function', kind='function', range=(1, 14))]

        chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python')

        # The symbol has to be split for the overlap check to mean anything;
        # asserting it here keeps the test from passing vacuously.
        assert len(chunks) > 1

        # With overlap, each chunk should start no later than the line on
        # which the previous chunk ends.
        for current, following in zip(chunks, chunks[1:]):
            current_end = current.metadata['end_line']
            next_start = following.metadata['start_line']
            assert next_start <= current_end

    def test_empty_symbol_filtered(self):
        """Test that symbols smaller than min_chunk_size are filtered."""
        config = ChunkConfig(max_chunk_size=1000, min_chunk_size=50)
        chunker = Chunker(config)

        content = '''def tiny():
    pass
'''
        symbols = [Symbol(name='tiny', kind='function', range=(1, 2))]

        chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python')

        # Should be filtered due to min_chunk_size
        assert len(chunks) == 0

    def test_class_symbol_splits(self):
        """Test that large class symbols are also split correctly."""
        config = ChunkConfig(max_chunk_size=120, overlap=25)
        chunker = Chunker(config)

        content = '''class LargeClass:
    """A large class with many methods."""

    def method1(self):
        return 1

    def method2(self):
        return 2

    def method3(self):
        return 3

    def method4(self):
        return 4
'''
        symbols = [Symbol(name='LargeClass', kind='class', range=(1, 14))]

        chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python')

        # Should be split
        assert len(chunks) > 1

        # All chunks should preserve class metadata
        for chunk in chunks:
            assert chunk.metadata['symbol_name'] == 'LargeClass'
            assert chunk.metadata['symbol_kind'] == 'class'
            assert chunk.metadata['strategy'] == 'symbol_split'
|
||||
|
||||
|
||||
class TestLightweightMode:
    """Test recursive splitting with lightweight token counting."""

    def test_large_symbol_splits_lightweight_mode(self):
        """Test that large symbols split correctly in lightweight mode.

        The chunker is configured with ``skip_token_count=True``; the
        resulting chunks are still required to carry a positive
        ``token_count`` entry in their metadata.
        """
        config = ChunkConfig(max_chunk_size=100, overlap=20, skip_token_count=True)
        chunker = Chunker(config)

        content = '''def large_function():
    # Line 1 with some extra content to make it longer
    # Line 2 with some extra content to make it longer
    # Line 3 with some extra content to make it longer
    # Line 4 with some extra content to make it longer
    # Line 5 with some extra content to make it longer
    # Line 6 with some extra content to make it longer
    # Line 7 with some extra content to make it longer
    # Line 8 with some extra content to make it longer
    # Line 9 with some extra content to make it longer
    # Line 10 with some extra content to make it longer
    pass
'''
        symbols = [Symbol(name='large_function', kind='function', range=(1, 13))]

        chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python')

        # Should split even in lightweight mode
        assert len(chunks) > 1

        # All chunks should have token_count (estimated)
        for chunk in chunks:
            assert 'token_count' in chunk.metadata
            assert chunk.metadata['token_count'] > 0
|
||||
Reference in New Issue
Block a user