From 57173c9b023ca19c775fa732b332cf346b5ce5aa Mon Sep 17 00:00:00 2001
From: catlog22
Date: Mon, 12 Jan 2026 17:47:19 +0800
Subject: [PATCH] feat: optimize dynamic batch size calculation to use the
 maximum character limit across all parsing rules, and adjust the safe range
 of the utilization factor
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../src/codexlens/cli/embedding_manager.py | 30 ++++++++++++++-----
 1 file changed, 23 insertions(+), 7 deletions(-)

diff --git a/codex-lens/src/codexlens/cli/embedding_manager.py b/codex-lens/src/codexlens/cli/embedding_manager.py
index 2b22b026..7667bf1c 100644
--- a/codex-lens/src/codexlens/cli/embedding_manager.py
+++ b/codex-lens/src/codexlens/cli/embedding_manager.py
@@ -63,10 +63,17 @@ def calculate_dynamic_batch_size(config, embedder) -> int:
     if not getattr(config, 'api_batch_size_dynamic', False):
         return getattr(config, 'api_batch_size', 8)
 
-    # Get maximum chunk character size from parsing rules
+    # Get maximum chunk character size from ALL parsing rules (not just default)
+    # This ensures we use the worst-case chunk size across all languages
     parsing_rules = getattr(config, 'parsing_rules', {})
-    default_rules = parsing_rules.get('default', {})
-    max_chunk_chars = default_rules.get('max_chunk_chars', 4000)
+    all_max_chunk_chars = [
+        rule.get('max_chunk_chars', 0)
+        for rule in parsing_rules.values()
+        if isinstance(rule, dict)
+    ]
+    max_chunk_chars = max(all_max_chunk_chars) if all_max_chunk_chars else 4000
+    if max_chunk_chars <= 0:
+        max_chunk_chars = 4000  # Final fallback
 
     # Get characters per token estimate
     chars_per_token = getattr(config, 'chars_per_token_estimate', 4)
@@ -83,10 +90,19 @@ def calculate_dynamic_batch_size(config, embedder) -> int:
     # Get model's maximum token capacity
     model_max_tokens = getattr(embedder, 'max_tokens', 8192)
 
-    # Get utilization factor (default 80%)
+    # Get utilization factor (default 80%, max 95% to leave safety margin)
     utilization_factor = getattr(config, 'api_batch_size_utilization_factor', 0.8)
-    if utilization_factor <= 0 or utilization_factor > 1:
-        utilization_factor = 0.8
+    if utilization_factor <= 0 or utilization_factor > 0.95:
+        if utilization_factor > 0.95:
+            logger.warning(
+                "Utilization factor %.2f exceeds safe limit 0.95. "
+                "Token estimation is approximate, high values risk API errors. "
+                "Clamping to 0.95.",
+                utilization_factor
+            )
+            utilization_factor = 0.95
+        else:
+            utilization_factor = 0.8
 
     # Calculate safe token limit
     safe_token_limit = model_max_tokens * utilization_factor
@@ -1378,7 +1394,7 @@ def generate_dense_embeddings_centralized(
 
         # Generate embeddings for this file's chunks
         batch_contents = [chunk.content for chunk in chunks]
-        embeddings_numpy = embedder.embed_to_numpy(batch_contents, batch_size=EMBEDDING_BATCH_SIZE)
+        embeddings_numpy = embedder.embed_to_numpy(batch_contents, batch_size=effective_batch_size)
 
         # Assign chunk IDs and store embeddings
         for i, chunk in enumerate(chunks):
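
Note: a minimal standalone sketch, assuming the remainder of
calculate_dynamic_batch_size (outside the hunks above) divides the safe token
budget by the worst-case tokens per chunk. The helper name
sketch_dynamic_batch_size and the example parsing rules are hypothetical; only
the attribute names and clamping logic mirror the diff.

# Illustrative sketch only -- not the repository's implementation.
import logging
import math

logger = logging.getLogger(__name__)


def sketch_dynamic_batch_size(parsing_rules: dict,
                              chars_per_token: int = 4,
                              model_max_tokens: int = 8192,
                              utilization_factor: float = 0.8) -> int:
    # Worst-case chunk size across ALL parsing rules, as in the patched code.
    all_max_chunk_chars = [
        rule.get('max_chunk_chars', 0)
        for rule in parsing_rules.values()
        if isinstance(rule, dict)
    ]
    max_chunk_chars = max(all_max_chunk_chars) if all_max_chunk_chars else 4000
    if max_chunk_chars <= 0:
        max_chunk_chars = 4000  # Final fallback

    # Clamp the utilization factor into (0, 0.95], as in the patched code.
    if utilization_factor <= 0 or utilization_factor > 0.95:
        if utilization_factor > 0.95:
            logger.warning("Clamping utilization factor %.2f to 0.95", utilization_factor)
            utilization_factor = 0.95
        else:
            utilization_factor = 0.8

    # Assumed final step: safe token budget divided by worst-case tokens per chunk.
    tokens_per_chunk = math.ceil(max_chunk_chars / max(chars_per_token, 1))
    safe_token_limit = model_max_tokens * utilization_factor
    return max(1, int(safe_token_limit // tokens_per_chunk))


# Example: the worst-case rule (6000 chars) drives the result: 8192 * 0.8 // 1500 -> 4.
rules = {'default': {'max_chunk_chars': 4000}, 'python': {'max_chunk_chars': 6000}}
print(sketch_dynamic_batch_size(rules))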