feat: 优化动态批量大小计算，确保使用所有解析规则的最大字符限制，并调整利用率因子的安全范围

2026-02-05 01:50:27 +08:00 · 2026-01-12 17:47:19 +08:00
parent 90a1321aac
commit 57173c9b02
1 changed file with 23 additions and 7 deletions
--- a/codex-lens/src/codexlens/cli/embedding_manager.py
+++ b/codex-lens/src/codexlens/cli/embedding_manager.py
@@ -63,10 +63,17 @@ def calculate_dynamic_batch_size(config, embedder) -> int:
    if not getattr(config, 'api_batch_size_dynamic', False):
        return getattr(config, 'api_batch_size', 8)

-    # Get maximum chunk character size from parsing rules
+    # Get maximum chunk character size from ALL parsing rules (not just default)
+    # This ensures we use the worst-case chunk size across all languages
    parsing_rules = getattr(config, 'parsing_rules', {})
-    default_rules = parsing_rules.get('default', {})
-    max_chunk_chars = default_rules.get('max_chunk_chars', 4000)
+    all_max_chunk_chars = [
+        rule.get('max_chunk_chars', 0)
+        for rule in parsing_rules.values()
+        if isinstance(rule, dict)
+    ]
+    max_chunk_chars = max(all_max_chunk_chars) if all_max_chunk_chars else 4000
+    if max_chunk_chars <= 0:
+        max_chunk_chars = 4000  # Final fallback

    # Get characters per token estimate
    chars_per_token = getattr(config, 'chars_per_token_estimate', 4)
@@ -83,10 +90,19 @@ def calculate_dynamic_batch_size(config, embedder) -> int:
    # Get model's maximum token capacity
    model_max_tokens = getattr(embedder, 'max_tokens', 8192)

-    # Get utilization factor (default 80%)
+    # Get utilization factor (default 80%, max 95% to leave safety margin)
    utilization_factor = getattr(config, 'api_batch_size_utilization_factor', 0.8)
-    if utilization_factor <= 0 or utilization_factor > 1:
-        utilization_factor = 0.8
+    if utilization_factor <= 0 or utilization_factor > 0.95:
+        if utilization_factor > 0.95:
+            logger.warning(
+                "Utilization factor %.2f exceeds safe limit 0.95. "
+                "Token estimation is approximate, high values risk API errors. "
+                "Clamping to 0.95.",
+                utilization_factor
+            )
+            utilization_factor = 0.95
+        else:
+            utilization_factor = 0.8

    # Calculate safe token limit
    safe_token_limit = model_max_tokens * utilization_factor
@@ -1378,7 +1394,7 @@ def generate_dense_embeddings_centralized(

                        # Generate embeddings for this file's chunks
                        batch_contents = [chunk.content for chunk in chunks]
-                        embeddings_numpy = embedder.embed_to_numpy(batch_contents, batch_size=EMBEDDING_BATCH_SIZE)
+                        embeddings_numpy = embedder.embed_to_numpy(batch_contents, batch_size=effective_batch_size)

                        # Assign chunk IDs and store embeddings
                        for i, chunk in enumerate(chunks):