feat: Enhance LiteLLM integration and CLI management

- Added token estimation and batching in LiteLLMEmbedder to handle large text inputs efficiently.
- Updated the embed method to accept a max_tokens_per_batch parameter for better API call management.
- Introduced new API routes (GET, POST, PUT, DELETE) for managing custom CLI endpoints.
- Enhanced the CLI history component to support source-directory context for native session content.
- Improved error handling and logging in various components for better debugging and user feedback.
- Added internationalization support for the new API endpoint features in the i18n module.
- Updated CodexLens CLI commands to allow concurrent API calls via a max_workers option.
- Enhanced the embedding manager to track model information and generate embeddings more robustly.
- Added entry points for CLI commands in the package configuration.
2026-02-05 01:50:27 +08:00 · 2025-12-24 18:01:26 +08:00
parent dfca4d60ee
commit e3e61bcae9
13 changed files with 575 additions and 107 deletions
--- a/ccw-litellm/src/ccw_litellm.egg-info/SOURCES.txt
+++ b/ccw-litellm/src/ccw_litellm.egg-info/SOURCES.txt
@@ -1,8 +1,11 @@
+README.md
 pyproject.toml
 src/ccw_litellm/__init__.py
+src/ccw_litellm/cli.py
 src/ccw_litellm.egg-info/PKG-INFO
 src/ccw_litellm.egg-info/SOURCES.txt
 src/ccw_litellm.egg-info/dependency_links.txt
+src/ccw_litellm.egg-info/entry_points.txt
 src/ccw_litellm.egg-info/requires.txt
 src/ccw_litellm.egg-info/top_level.txt
 src/ccw_litellm/clients/__init__.py
--- a/ccw-litellm/src/ccw_litellm.egg-info/entry_points.txt
+++ b/ccw-litellm/src/ccw_litellm.egg-info/entry_points.txt
@@ -0,0 +1,2 @@
+[console_scripts]
+ccw-litellm = ccw_litellm.cli:main
--- a/ccw-litellm/src/ccw_litellm/clients/litellm_embedder.py
+++ b/ccw-litellm/src/ccw_litellm/clients/litellm_embedder.py
@@ -102,18 +102,75 @@ class LiteLLMEmbedder(AbstractEmbedder):
        """Embedding vector size."""
        return self._model_config.dimensions

+    def _estimate_tokens(self, text: str) -> int:
+        """Estimate the token count of a text with a fast character-count heuristic.
+
+        Args:
+            text: Text to estimate tokens for
+
+        Returns:
+            ``len(text) // 4`` (assumes ~4 characters per token; 0 for texts under 4 chars)
+        """
+        return len(text) // 4  # NOTE(review): ratio may underestimate non-Latin (e.g. CJK) text - confirm
+
+    def _create_batches(
+        self,
+        texts: list[str],
+        max_tokens: int = 30000
+    ) -> list[list[str]]:
+        """Greedily group texts, in input order, into batches under an estimated token budget.
+
+        Args:
+            texts: List of texts to batch
+            max_tokens: Maximum estimated tokens per batch (default: 30000, safe margin for 40960 limit)
+
+        Returns:
+            List of text batches; an oversized text is truncated to max_tokens * 4 chars, never dropped
+        """
+        batches = []
+        current_batch = []
+        current_tokens = 0
+
+        for text in texts:
+            text_tokens = self._estimate_tokens(text)
+
+            # A single text over the limit is cut down; everything past the cut is discarded
+            if text_tokens > max_tokens:
+                logger.warning(f"Text with {text_tokens} estimated tokens exceeds limit, truncating")
+                # Invert the 4-chars-per-token estimate to get the character budget
+                max_chars = max_tokens * 4
+                text = text[:max_chars]
+                text_tokens = self._estimate_tokens(text)
+
+            # Close the current batch if adding this text would exceed the limit (never emit an empty batch)
+            if current_tokens + text_tokens > max_tokens and current_batch:
+                batches.append(current_batch)
+                current_batch = []
+                current_tokens = 0
+
+            current_batch.append(text)
+            current_tokens += text_tokens
+
+        # Flush the last, partially filled batch
+        if current_batch:
+            batches.append(current_batch)
+
+        return batches
+
    def embed(
        self,
        texts: str | Sequence[str],
        *,
        batch_size: int | None = None,
+        max_tokens_per_batch: int = 30000,
        **kwargs: Any,
    ) -> NDArray[np.floating]:
        """Embed one or more texts.

        Args:
            texts: Single text or sequence of texts
-            batch_size: Batch size for processing (currently unused, LiteLLM handles batching)
+            batch_size: Batch size for processing (deprecated, use max_tokens_per_batch)
+            max_tokens_per_batch: Maximum estimated tokens per API call (default: 30000)
            **kwargs: Additional arguments for litellm.embedding()

        Returns:
@@ -125,10 +182,8 @@ class LiteLLMEmbedder(AbstractEmbedder):
        # Normalize input to list
        if isinstance(texts, str):
            text_list = [texts]
-            single_input = True
        else:
            text_list = list(texts)
-            single_input = False

        if not text_list:
            # Return empty array with correct shape
@@ -137,36 +192,53 @@ class LiteLLMEmbedder(AbstractEmbedder):
        # Merge kwargs
        embedding_kwargs = {**self._litellm_kwargs, **kwargs}

-        try:
-            # For OpenAI-compatible endpoints, ensure encoding_format is set
-            if self._provider_config.api_base and "encoding_format" not in embedding_kwargs:
-                embedding_kwargs["encoding_format"] = "float"
+        # For OpenAI-compatible endpoints, ensure encoding_format is set
+        if self._provider_config.api_base and "encoding_format" not in embedding_kwargs:
+            embedding_kwargs["encoding_format"] = "float"

-            # Call LiteLLM embedding
-            response = litellm.embedding(
-                model=self._format_model_name(),
-                input=text_list,
-                **embedding_kwargs,
-            )
+        # Split into token-aware batches
+        batches = self._create_batches(text_list, max_tokens_per_batch)

-            # Extract embeddings
-            embeddings = [item["embedding"] for item in response.data]
+        if len(batches) > 1:
+            logger.info(f"Split {len(text_list)} texts into {len(batches)} batches for embedding")

-            # Convert to numpy array
-            result = np.array(embeddings, dtype=np.float32)
+        all_embeddings = []

-            # Validate dimensions
-            if result.shape[1] != self.dimensions:
-                logger.warning(
-                    f"Expected {self.dimensions} dimensions, got {result.shape[1]}. "
-                    f"Configuration may be incorrect."
+        for batch_idx, batch in enumerate(batches):
+            try:
+                # Build call kwargs with explicit api_base
+                call_kwargs = {**embedding_kwargs}
+                if self._provider_config.api_base:
+                    call_kwargs["api_base"] = self._provider_config.api_base
+                if self._provider_config.api_key:
+                    call_kwargs["api_key"] = self._provider_config.api_key
+
+                # Call LiteLLM embedding for this batch
+                response = litellm.embedding(
+                    model=self._format_model_name(),
+                    input=batch,
+                    **call_kwargs,
                )

-            return result
+                # Extract embeddings
+                batch_embeddings = [item["embedding"] for item in response.data]
+                all_embeddings.extend(batch_embeddings)

-        except Exception as e:
-            logger.error(f"LiteLLM embedding failed: {e}")
-            raise
+            except Exception as e:
+                logger.error(f"LiteLLM embedding failed for batch {batch_idx + 1}/{len(batches)}: {e}")
+                raise
+
+        # Convert to numpy array
+        result = np.array(all_embeddings, dtype=np.float32)
+
+        # Validate dimensions
+        if result.shape[1] != self.dimensions:
+            logger.warning(
+                f"Expected {self.dimensions} dimensions, got {result.shape[1]}. "
+                f"Configuration may be incorrect."
+            )
+
+        return result

    @property
    def model_name(self) -> str: