feat: Enhance configuration management and embedding capabilities

- Added JSON-based settings management in Config class for embedding and LLM configurations.
- Introduced methods to save and load settings from a JSON file (see the illustrative sketch below).
- Updated BaseEmbedder and its subclasses to include a max_tokens property for better token management.
- Enhanced chunking strategy to support recursive splitting of large symbols with improved overlap handling.
- Implemented comprehensive tests for recursive splitting and chunking behavior.
- Added CLI tools configuration management for better integration with external tools.
- Introduced a new command for compacting session memory into structured text for recovery.
Author: catlog22
Date: 2025-12-24 16:32:27 +08:00
Parent: b00113d212
Commit: e671b45948
25 changed files with 2889 additions and 153 deletions
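The Config changes mentioned in the message are not among the hunks shown below, so the following is only a hedged sketch of what JSON-backed persistence of embedding/LLM settings might look like. The class shape, the method names save_settings/load_settings, and the settings keys are assumptions for illustration, not the commit's actual API.

import json
from pathlib import Path


class Config:
    """Hypothetical sketch only; the real Config class from this commit is not shown here."""

    def __init__(self, settings_path: Path) -> None:
        self.settings_path = settings_path
        # Assumed default structure for embedding and LLM settings.
        self.settings = {"embedding": {"profile": "code"}, "llm": {"model": "example-model"}}

    def save_settings(self) -> None:
        # Persist current settings as pretty-printed JSON.
        self.settings_path.write_text(json.dumps(self.settings, indent=2), encoding="utf-8")

    def load_settings(self) -> None:
        # Read settings back from disk if the file exists.
        if self.settings_path.exists():
            self.settings = json.loads(self.settings_path.read_text(encoding="utf-8"))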


@@ -38,6 +38,16 @@ class BaseEmbedder(ABC):
"""
...
@property
def max_tokens(self) -> int:
"""Return maximum token limit for embeddings.
Returns:
int: Maximum number of tokens that can be embedded at once.
Default is 8192 if not overridden by implementation.
"""
return 8192
@abstractmethod
def embed_to_numpy(self, texts: str | Iterable[str]) -> np.ndarray:
"""Embed texts to numpy array.


@@ -39,7 +39,7 @@ from codexlens.parsers.tokenizer import get_default_tokenizer
 class ChunkConfig:
     """Configuration for chunking strategies."""
     max_chunk_size: int = 1000  # Max characters per chunk
-    overlap: int = 100  # Overlap for sliding window
+    overlap: int = 200  # Overlap for sliding window (increased from 100 for better context)
     strategy: str = "auto"  # Chunking strategy: auto, symbol, sliding_window, hybrid
     min_chunk_size: int = 50  # Minimum chunk size
     skip_token_count: bool = False  # Skip expensive token counting (use char/4 estimate)
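A quick arithmetic check on the new defaults, as a standalone sketch using the field values above: with max_chunk_size=1000 and overlap=200 the window advances by 800 characters, so a 2,500-character symbol splits into 3 windows.

max_chunk_size, overlap = 1000, 200
step = max_chunk_size - overlap  # 800 characters per advance
symbol_len = 2500
windows = []
start = 0
while True:
    windows.append((start, min(start + max_chunk_size, symbol_len)))
    if start + max_chunk_size >= symbol_len:
        break
    start += step
assert windows == [(0, 1000), (800, 1800), (1600, 2500)]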
@@ -80,6 +80,7 @@ class Chunker:
"""Chunk code by extracted symbols (functions, classes).
Each symbol becomes one chunk with its full content.
Large symbols exceeding max_chunk_size are recursively split using sliding window.
Args:
content: Source code content
@@ -101,27 +102,49 @@ class Chunker:
             if len(chunk_content.strip()) < self.config.min_chunk_size:
                 continue

-            # Calculate token count if not provided
-            token_count = None
-            if symbol_token_counts and symbol.name in symbol_token_counts:
-                token_count = symbol_token_counts[symbol.name]
-            else:
-                token_count = self._estimate_token_count(chunk_content)
+            # Check if symbol content exceeds max_chunk_size
+            if len(chunk_content) > self.config.max_chunk_size:
+                # Create line mapping for correct line number tracking
+                line_mapping = list(range(start_line, end_line + 1))
-            chunks.append(SemanticChunk(
-                content=chunk_content,
-                embedding=None,
-                metadata={
-                    "file": str(file_path),
-                    "language": language,
-                    "symbol_name": symbol.name,
-                    "symbol_kind": symbol.kind,
-                    "start_line": start_line,
-                    "end_line": end_line,
-                    "strategy": "symbol",
-                    "token_count": token_count,
-                }
-            ))
+                # Use sliding window to split large symbol
+                sub_chunks = self.chunk_sliding_window(
+                    chunk_content,
+                    file_path=file_path,
+                    language=language,
+                    line_mapping=line_mapping
+                )
+                # Update sub_chunks with parent symbol metadata
+                for sub_chunk in sub_chunks:
+                    sub_chunk.metadata["symbol_name"] = symbol.name
+                    sub_chunk.metadata["symbol_kind"] = symbol.kind
+                    sub_chunk.metadata["strategy"] = "symbol_split"
+                    sub_chunk.metadata["parent_symbol_range"] = (start_line, end_line)
+                chunks.extend(sub_chunks)
+            else:
+                # Calculate token count if not provided
+                token_count = None
+                if symbol_token_counts and symbol.name in symbol_token_counts:
+                    token_count = symbol_token_counts[symbol.name]
+                else:
+                    token_count = self._estimate_token_count(chunk_content)
+                chunks.append(SemanticChunk(
+                    content=chunk_content,
+                    embedding=None,
+                    metadata={
+                        "file": str(file_path),
+                        "language": language,
+                        "symbol_name": symbol.name,
+                        "symbol_kind": symbol.kind,
+                        "start_line": start_line,
+                        "end_line": end_line,
+                        "strategy": "symbol",
+                        "token_count": token_count,
+                    }
+                ))

         return chunks
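To summarize the new branch above outside the Chunker class, here is a standalone sketch (plain dicts instead of SemanticChunk, character offsets instead of the real line_mapping) of how an oversized symbol ends up as overlapping symbol_split pieces that remember their parent range:

def split_large_symbol(text, name, start_line, end_line, max_chunk_size=1000, overlap=200):
    # Small symbols stay whole, mirroring the "symbol" branch above.
    if len(text) <= max_chunk_size:
        return [{"content": text, "symbol_name": name, "strategy": "symbol"}]
    # Oversized symbols are cut into overlapping windows, mirroring the
    # chunk_sliding_window fallback, and tagged with the parent symbol's range.
    step = max_chunk_size - overlap
    pieces = []
    for offset in range(0, len(text), step):
        pieces.append({
            "content": text[offset:offset + max_chunk_size],
            "symbol_name": name,
            "strategy": "symbol_split",
            "parent_symbol_range": (start_line, end_line),
        })
        if offset + max_chunk_size >= len(text):
            break
    return pieces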


@@ -165,6 +165,33 @@ class Embedder(BaseEmbedder):
"""Get embedding dimension for current model."""
return self.MODEL_DIMS.get(self._model_name, 768) # Default to 768 if unknown
@property
def max_tokens(self) -> int:
"""Get maximum token limit for current model.
Returns:
int: Maximum number of tokens based on model profile.
- fast: 512 (lightweight, optimized for speed)
- code: 8192 (code-optimized, larger context)
- multilingual: 512 (standard multilingual model)
- balanced: 512 (general purpose)
"""
# Determine profile from model name
profile = None
for prof, model in self.MODELS.items():
if model == self._model_name:
profile = prof
break
# Return token limit based on profile
if profile == "code":
return 8192
elif profile in ("fast", "multilingual", "balanced"):
return 512
else:
# Default for unknown models
return 512
@property
def providers(self) -> List[str]:
"""Get configured ONNX execution providers."""


@@ -63,11 +63,39 @@ class LiteLLMEmbedderWrapper(BaseEmbedder):
"""
return self._embedder.model_name
def embed_to_numpy(self, texts: str | Iterable[str]) -> np.ndarray:
@property
def max_tokens(self) -> int:
"""Return maximum token limit for the embedding model.
Returns:
int: Maximum number of tokens that can be embedded at once.
Inferred from model config or model name patterns.
"""
# Try to get from LiteLLM config first
if hasattr(self._embedder, 'max_input_tokens') and self._embedder.max_input_tokens:
return self._embedder.max_input_tokens
# Infer from model name
model_name_lower = self.model_name.lower()
# Large models (8B or "large" in name)
if '8b' in model_name_lower or 'large' in model_name_lower:
return 32768
# OpenAI text-embedding-3-* models
if 'text-embedding-3' in model_name_lower:
return 8191
# Default fallback
return 8192
def embed_to_numpy(self, texts: str | Iterable[str], **kwargs) -> np.ndarray:
"""Embed texts to numpy array using LiteLLMEmbedder.
Args:
texts: Single text or iterable of texts to embed.
**kwargs: Additional arguments (ignored for LiteLLM backend).
Accepts batch_size for API compatibility with fastembed.
Returns:
numpy.ndarray: Array of shape (n_texts, embedding_dim) containing embeddings.
@@ -76,4 +104,5 @@ class LiteLLMEmbedderWrapper(BaseEmbedder):
texts = [texts]
else:
texts = list(texts)
# LiteLLM handles batching internally, ignore batch_size parameter
return self._embedder.embed(texts)
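Finally, a standalone mirror of the wrapper's name-based fallback above, useful for eyeballing which bucket a model name lands in; the example model names are illustrative only. Note that, as written, a name containing "large" (e.g. text-embedding-3-large) takes the 32768 branch before the text-embedding-3 check is reached.

def infer_max_tokens(model_name, configured=None):
    # Explicit config wins, as in max_tokens above.
    if configured:
        return configured
    name = model_name.lower()
    if "8b" in name or "large" in name:
        return 32768
    if "text-embedding-3" in name:
        return 8191
    return 8192

assert infer_max_tokens("openai/text-embedding-3-small") == 8191
assert infer_max_tokens("qwen3-embedding-8b") == 32768
assert infer_max_tokens("some-unknown-model") == 8192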