From 57173c9b023ca19c775fa732b332cf346b5ce5aa Mon Sep 17 00:00:00 2001
From: catlog22
Date: Mon, 12 Jan 2026 17:47:19 +0800
Subject: [PATCH] feat: optimize dynamic batch size calculation to use the
 maximum character limit across all parsing rules, and adjust the safe range
 of the utilization factor
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../src/codexlens/cli/embedding_manager.py | 30 ++++++++++++++-----
 1 file changed, 23 insertions(+), 7 deletions(-)

diff --git a/codex-lens/src/codexlens/cli/embedding_manager.py b/codex-lens/src/codexlens/cli/embedding_manager.py
index 2b22b026..7667bf1c 100644
--- a/codex-lens/src/codexlens/cli/embedding_manager.py
+++ b/codex-lens/src/codexlens/cli/embedding_manager.py
@@ -63,10 +63,17 @@ def calculate_dynamic_batch_size(config, embedder) -> int:
     if not getattr(config, 'api_batch_size_dynamic', False):
         return getattr(config, 'api_batch_size', 8)
 
-    # Get maximum chunk character size from parsing rules
+    # Get maximum chunk character size from ALL parsing rules (not just default)
+    # This ensures we use the worst-case chunk size across all languages
     parsing_rules = getattr(config, 'parsing_rules', {})
-    default_rules = parsing_rules.get('default', {})
-    max_chunk_chars = default_rules.get('max_chunk_chars', 4000)
+    all_max_chunk_chars = [
+        rule.get('max_chunk_chars', 0)
+        for rule in parsing_rules.values()
+        if isinstance(rule, dict)
+    ]
+    max_chunk_chars = max(all_max_chunk_chars) if all_max_chunk_chars else 4000
+    if max_chunk_chars <= 0:
+        max_chunk_chars = 4000  # Final fallback
 
     # Get characters per token estimate
     chars_per_token = getattr(config, 'chars_per_token_estimate', 4)
@@ -83,10 +90,19 @@ def calculate_dynamic_batch_size(config, embedder) -> int:
     # Get model's maximum token capacity
     model_max_tokens = getattr(embedder, 'max_tokens', 8192)
 
-    # Get utilization factor (default 80%)
+    # Get utilization factor (default 80%, max 95% to leave safety margin)
     utilization_factor = getattr(config, 'api_batch_size_utilization_factor', 0.8)
-    if utilization_factor <= 0 or utilization_factor > 1:
-        utilization_factor = 0.8
+    if utilization_factor <= 0 or utilization_factor > 0.95:
+        if utilization_factor > 0.95:
+            logger.warning(
+                "Utilization factor %.2f exceeds safe limit 0.95. "
+                "Token estimation is approximate, high values risk API errors. "
+                "Clamping to 0.95.",
+                utilization_factor
+            )
+            utilization_factor = 0.95
+        else:
+            utilization_factor = 0.8
 
     # Calculate safe token limit
     safe_token_limit = model_max_tokens * utilization_factor
@@ -1378,7 +1394,7 @@ def generate_dense_embeddings_centralized(
 
         # Generate embeddings for this file's chunks
         batch_contents = [chunk.content for chunk in chunks]
-        embeddings_numpy = embedder.embed_to_numpy(batch_contents, batch_size=EMBEDDING_BATCH_SIZE)
+        embeddings_numpy = embedder.embed_to_numpy(batch_contents, batch_size=effective_batch_size)
 
         # Assign chunk IDs and store embeddings
         for i, chunk in enumerate(chunks):
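
Note: a minimal standalone sketch, assuming the remainder of
calculate_dynamic_batch_size (outside the hunks above) divides the safe token
budget by the worst-case tokens per chunk. The helper name
sketch_dynamic_batch_size and the example parsing rules are hypothetical; only
the attribute names and clamping logic mirror the diff.

# Illustrative sketch only -- not the repository's implementation.
import logging
import math

logger = logging.getLogger(__name__)


def sketch_dynamic_batch_size(parsing_rules: dict,
                              chars_per_token: int = 4,
                              model_max_tokens: int = 8192,
                              utilization_factor: float = 0.8) -> int:
    # Worst-case chunk size across ALL parsing rules, as in the patched code.
    all_max_chunk_chars = [
        rule.get('max_chunk_chars', 0)
        for rule in parsing_rules.values()
        if isinstance(rule, dict)
    ]
    max_chunk_chars = max(all_max_chunk_chars) if all_max_chunk_chars else 4000
    if max_chunk_chars <= 0:
        max_chunk_chars = 4000  # Final fallback

    # Clamp the utilization factor into (0, 0.95], as in the patched code.
    if utilization_factor <= 0 or utilization_factor > 0.95:
        if utilization_factor > 0.95:
            logger.warning("Clamping utilization factor %.2f to 0.95", utilization_factor)
            utilization_factor = 0.95
        else:
            utilization_factor = 0.8

    # Assumed final step: safe token budget divided by worst-case tokens per chunk.
    tokens_per_chunk = math.ceil(max_chunk_chars / max(chars_per_token, 1))
    safe_token_limit = model_max_tokens * utilization_factor
    return max(1, int(safe_token_limit // tokens_per_chunk))


# Example: the worst-case rule (6000 chars) drives the result: 8192 * 0.8 // 1500 -> 4.
rules = {'default': {'max_chunk_chars': 4000}, 'python': {'max_chunk_chars': 6000}}
print(sketch_dynamic_batch_size(rules))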