feat: Enhance configuration management and embedding capabilities

- Added JSON-based settings management in the Config class for embedding and LLM configurations.
- Introduced methods to save and load settings from a JSON file.
- Updated BaseEmbedder and its subclasses to include a max_tokens property for better token management.
- Enhanced the chunking strategy to support recursive splitting of large symbols with improved overlap handling.
- Implemented comprehensive tests for recursive splitting and chunking behavior.
- Added CLI tools configuration management for better integration with external tools.
- Introduced a new command for compacting session memory into structured text for recovery.
2026-02-05 01:50:27 +08:00 · 2025-12-24 16:32:27 +08:00
parent b00113d212
commit e671b45948
25 changed files with 2889 additions and 153 deletions
--- a/codex-lens/tests/test_recursive_splitting.py
+++ b/codex-lens/tests/test_recursive_splitting.py
@@ -0,0 +1,291 @@
+"""Tests for recursive splitting of large symbols in chunker."""
+
+import pytest
+from codexlens.entities import Symbol
+from codexlens.semantic.chunker import Chunker, ChunkConfig
+
+
+class TestRecursiveSplitting:
+    """Test cases for recursive splitting of large symbols via chunk_by_symbol."""
+
+    def test_small_symbol_no_split(self):
+        """A symbol below max_chunk_size stays one chunk with the 'symbol' strategy."""
+        config = ChunkConfig(max_chunk_size=1000, overlap=100)
+        chunker = Chunker(config)
+
+        content = '''def small_function():
+    # This is a small function
+    x = 1
+    y = 2
+    return x + y
+'''
+        symbols = [Symbol(name='small_function', kind='function', range=(1, 5))]
+
+        chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python')
+        # An unsplit symbol keeps the plain strategy and carries no parent range
+        assert len(chunks) == 1
+        assert chunks[0].metadata['strategy'] == 'symbol'
+        assert chunks[0].metadata['symbol_name'] == 'small_function'
+        assert chunks[0].metadata['symbol_kind'] == 'function'
+        assert 'parent_symbol_range' not in chunks[0].metadata
+
+    def test_large_symbol_splits(self):
+        """A symbol longer than max_chunk_size is split into several tagged sub-chunks."""
+        config = ChunkConfig(max_chunk_size=100, overlap=20)
+        chunker = Chunker(config)
+
+        content = '''def large_function():
+    # Line 1
+    # Line 2
+    # Line 3
+    # Line 4
+    # Line 5
+    # Line 6
+    # Line 7
+    # Line 8
+    # Line 9
+    # Line 10
+    # Line 11
+    # Line 12
+    # Line 13
+    # Line 14
+    # Line 15
+    pass
+'''
+        symbols = [Symbol(name='large_function', kind='function', range=(1, 18))]
+
+        chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python')
+
+        # Should be split into multiple chunks
+        assert len(chunks) > 1
+
+        # Every sub-chunk keeps the parent symbol's name, kind and declared range
+        for chunk in chunks:
+            assert chunk.metadata['strategy'] == 'symbol_split'
+            assert chunk.metadata['symbol_name'] == 'large_function'
+            assert chunk.metadata['symbol_kind'] == 'function'
+            assert chunk.metadata['parent_symbol_range'] == (1, 18)
+
+    def test_boundary_condition(self):
+        """A symbol whose text is just over max_chunk_size takes the split path."""
+        config = ChunkConfig(max_chunk_size=90, overlap=20)
+        chunker = Chunker(config)
+
+        content = '''def boundary_function():
+    # This function is exactly at boundary
+    x = 1
+    y = 2
+    return x + y
+'''
+        symbols = [Symbol(name='boundary_function', kind='function', range=(1, 5))]
+
+        chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python')
+
+        # Content is over 90 chars (roughly 105), so the split strategy is expected
+        assert len(chunks) >= 1
+        assert chunks[0].metadata['strategy'] == 'symbol_split'
+
+    def test_multiple_symbols_mixed_sizes(self):
+        """Symbols of different sizes in one file are filtered, kept or split."""
+        config = ChunkConfig(max_chunk_size=150, overlap=30)
+        chunker = Chunker(config)
+
+        content = '''def small():
+    return 1
+
+def medium():
+    # Medium function
+    x = 1
+    y = 2
+    z = 3
+    return x + y + z
+
+def very_large():
+    # Line 1
+    # Line 2
+    # Line 3
+    # Line 4
+    # Line 5
+    # Line 6
+    # Line 7
+    # Line 8
+    # Line 9
+    # Line 10
+    # Line 11
+    # Line 12
+    # Line 13
+    # Line 14
+    # Line 15
+    pass
+'''
+        symbols = [
+            Symbol(name='small', kind='function', range=(1, 2)),
+            Symbol(name='medium', kind='function', range=(4, 9)),
+            Symbol(name='very_large', kind='function', range=(11, 28)),
+        ]
+
+        chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python')
+
+        # Group the resulting chunks by the symbol they were produced from
+        small_chunks = [c for c in chunks if c.metadata['symbol_name'] == 'small']
+        medium_chunks = [c for c in chunks if c.metadata['symbol_name'] == 'medium']
+        large_chunks = [c for c in chunks if c.metadata['symbol_name'] == 'very_large']
+
+        # Small should be filtered (< min_chunk_size)
+        assert len(small_chunks) == 0
+
+        # Medium should not be split
+        assert len(medium_chunks) == 1
+        assert medium_chunks[0].metadata['strategy'] == 'symbol'
+
+        # Large should be split
+        assert len(large_chunks) > 1
+        for chunk in large_chunks:
+            assert chunk.metadata['strategy'] == 'symbol_split'
+
+    def test_line_numbers_preserved(self):
+        """Sub-chunks report start/end lines that stay inside the parent symbol."""
+        config = ChunkConfig(max_chunk_size=100, overlap=20)
+        chunker = Chunker(config)
+
+        content = '''def large_function():
+    # Line 1 with some extra content to make it longer
+    # Line 2 with some extra content to make it longer
+    # Line 3 with some extra content to make it longer
+    # Line 4 with some extra content to make it longer
+    # Line 5 with some extra content to make it longer
+    # Line 6 with some extra content to make it longer
+    # Line 7 with some extra content to make it longer
+    # Line 8 with some extra content to make it longer
+    # Line 9 with some extra content to make it longer
+    # Line 10 with some extra content to make it longer
+    pass
+'''
+        symbols = [Symbol(name='large_function', kind='function', range=(1, 13))]
+
+        chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python')
+
+        # A split must occur and the first sub-chunk must begin on the symbol's first line
+        assert len(chunks) > 1
+        assert chunks[0].metadata['start_line'] == 1
+
+        # Each chunk's line span must be well-formed and lie within the range (1, 13)
+        for chunk in chunks:
+            assert chunk.metadata['start_line'] >= 1
+            assert chunk.metadata['end_line'] <= 13
+            assert chunk.metadata['start_line'] <= chunk.metadata['end_line']
+
+    def test_overlap_in_split_chunks(self):
+        """Each sub-chunk must start at or before the line where the previous one ends."""
+        config = ChunkConfig(max_chunk_size=100, overlap=30)
+        chunker = Chunker(config)
+
+        content = '''def large_function():
+    # Line 1
+    # Line 2
+    # Line 3
+    # Line 4
+    # Line 5
+    # Line 6
+    # Line 7
+    # Line 8
+    # Line 9
+    # Line 10
+    # Line 11
+    # Line 12
+    pass
+'''
+        symbols = [Symbol(name='large_function', kind='function', range=(1, 14))]
+
+        chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python')
+
+        # Require an actual split; otherwise the overlap check below would pass vacuously
+        assert len(chunks) > 1
+        for current, following in zip(chunks, chunks[1:]):
+            # Next chunk should start before current chunk ends (overlap)
+            current_end = current.metadata['end_line']
+            next_start = following.metadata['start_line']
+            # Overlap should exist
+            assert next_start <= current_end
+
+    def test_empty_symbol_filtered(self):
+        """Test that symbols smaller than min_chunk_size are filtered."""
+        config = ChunkConfig(max_chunk_size=1000, min_chunk_size=50)
+        chunker = Chunker(config)
+
+        content = '''def tiny():
+    pass
+'''
+        symbols = [Symbol(name='tiny', kind='function', range=(1, 2))]
+
+        chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python')
+
+        # Should be filtered due to min_chunk_size
+        assert len(chunks) == 0
+
+    def test_class_symbol_splits(self):
+        """A large 'class' symbol is split just like a large function symbol."""
+        config = ChunkConfig(max_chunk_size=120, overlap=25)
+        chunker = Chunker(config)
+
+        content = '''class LargeClass:
+    """A large class with many methods."""
+
+    def method1(self):
+        return 1
+
+    def method2(self):
+        return 2
+
+    def method3(self):
+        return 3
+
+    def method4(self):
+        return 4
+'''
+        symbols = [Symbol(name='LargeClass', kind='class', range=(1, 14))]
+
+        chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python')
+
+        # Should be split
+        assert len(chunks) > 1
+
+        # All chunks should preserve class metadata
+        for chunk in chunks:
+            assert chunk.metadata['symbol_name'] == 'LargeClass'
+            assert chunk.metadata['symbol_kind'] == 'class'
+            assert chunk.metadata['strategy'] == 'symbol_split'
+
+
+class TestLightweightMode:
+    """Recursive splitting when the chunker is configured with skip_token_count=True."""
+
+    def test_large_symbol_splits_lightweight_mode(self):
+        """An oversized symbol is still split, and each piece reports a token_count."""
+        # Lightweight mode: the config is built with skip_token_count=True
+        splitter = Chunker(ChunkConfig(max_chunk_size=100, overlap=20, skip_token_count=True))
+
+        source = '''def large_function():
+    # Line 1 with some extra content to make it longer
+    # Line 2 with some extra content to make it longer
+    # Line 3 with some extra content to make it longer
+    # Line 4 with some extra content to make it longer
+    # Line 5 with some extra content to make it longer
+    # Line 6 with some extra content to make it longer
+    # Line 7 with some extra content to make it longer
+    # Line 8 with some extra content to make it longer
+    # Line 9 with some extra content to make it longer
+    # Line 10 with some extra content to make it longer
+    pass
+'''
+        declared = [Symbol(name='large_function', kind='function', range=(1, 13))]
+
+        pieces = splitter.chunk_by_symbol(source, declared, 'test.py', 'python')
+
+        # Splitting must still happen in lightweight mode
+        assert len(pieces) > 1
+
+        # Every piece must expose a token_count entry (estimated),
+        # and that entry must be strictly positive
+        assert all('token_count' in piece.metadata for piece in pieces)
+        assert all(piece.metadata['token_count'] > 0 for piece in pieces)