feat: 增强索引树构建逻辑,支持递归检查子目录中的可索引文件

This commit is contained in:
catlog22
2026-01-13 11:08:48 +08:00
parent 8c2d39d517
commit 8a15e08944
2 changed files with 59 additions and 6 deletions

View File

@@ -535,10 +535,15 @@ def generate_embeddings(
# skip_token_count=True: Use fast estimation (len/4) instead of expensive tiktoken
# This significantly reduces CPU usage with minimal impact on metadata accuracy
# Load chunk stripping config from settings
from codexlens.config import Config
chunk_cfg = Config.load()
chunker = Chunker(config=ChunkConfig(
max_chunk_size=chunk_size,
overlap=overlap,
skip_token_count=True
skip_token_count=True,
strip_comments=getattr(chunk_cfg, 'chunk_strip_comments', True),
strip_docstrings=getattr(chunk_cfg, 'chunk_strip_docstrings', True),
))
# Log embedder info with endpoint count for multi-endpoint mode
@@ -1307,10 +1312,15 @@ def generate_dense_embeddings_centralized(
"error": f"Invalid embedding backend: {embedding_backend}",
}
# Load chunk stripping config from settings
from codexlens.config import Config
chunk_cfg = Config.load()
chunker = Chunker(config=ChunkConfig(
max_chunk_size=chunk_size,
overlap=overlap,
skip_token_count=True
skip_token_count=True,
strip_comments=getattr(chunk_cfg, 'chunk_strip_comments', True),
strip_docstrings=getattr(chunk_cfg, 'chunk_strip_docstrings', True),
))
if progress_callback:
@@ -1319,8 +1329,7 @@ def generate_dense_embeddings_centralized(
progress_callback(f"Using model: {embedder.model_name} ({embedder.embedding_dim} dimensions)")
# Calculate dynamic batch size based on model capacity
from codexlens.config import Config
batch_config = Config.load()
batch_config = chunk_cfg # Reuse already loaded config
effective_batch_size = calculate_dynamic_batch_size(batch_config, embedder)
if progress_callback and batch_config.api_batch_size_dynamic:

View File

@@ -412,7 +412,8 @@ class IndexTreeBuilder:
A directory is indexed if:
1. It's not in IGNORE_DIRS
2. It doesn't start with '.'
3. It contains at least one supported language file
3. It contains at least one supported language file, OR
4. It has subdirectories that contain supported files (transitive)
Args:
dir_path: Directory to check
@@ -427,7 +428,50 @@ class IndexTreeBuilder:
# Check for supported files in this directory
source_files = self._iter_source_files(dir_path, languages)
return len(source_files) > 0
if len(source_files) > 0:
return True
# Check if any subdirectory has indexable files (transitive)
# This handles cases like 'src' which has no direct files but has 'src/codexlens'
for item in dir_path.iterdir():
if not item.is_dir():
continue
if item.name in self.IGNORE_DIRS or item.name.startswith("."):
continue
# Recursively check subdirectories
if self._has_indexable_files_recursive(item, languages):
return True
return False
def _has_indexable_files_recursive(self, dir_path: Path, languages: "List[str] | None" = None) -> bool:
    """Check whether *dir_path* or any of its subdirectories contains indexable files.

    Transitive helper for directory indexing: a directory with no direct
    source files is still considered indexable when some nested
    subdirectory holds supported files (e.g. ``src`` containing only
    ``src/codexlens``).

    Args:
        dir_path: Directory to check.
        languages: Optional language filter, passed through unchanged to
            ``_iter_source_files``.

    Returns:
        True if the directory tree rooted at ``dir_path`` contains at
        least one indexable file; False otherwise (including when the
        directory cannot be read).
    """
    # Direct hit: this directory itself has supported source files.
    # _iter_source_files returns a sized collection here, so truthiness
    # is equivalent to the previous len(...) > 0 check.
    if self._iter_source_files(dir_path, languages):
        return True
    # Otherwise recurse into subdirectories, skipping ignored and
    # hidden (dot-prefixed) entries to mirror the caller's filtering.
    try:
        for child in dir_path.iterdir():
            if not child.is_dir():
                continue
            if child.name in self.IGNORE_DIRS or child.name.startswith("."):
                continue
            if self._has_indexable_files_recursive(child, languages):
                return True
    except OSError:
        # Best-effort scan: unreadable or vanished directories (permission
        # errors, filesystem races during iterdir) count as containing
        # no indexable files rather than aborting the whole tree build.
        pass
    return False
def _build_level_parallel(
self,