feat: enhance index tree building to recursively detect indexable files in subdirectories

This commit is contained in:
catlog22
2026-01-13 11:08:48 +08:00
parent 8c2d39d517
commit 8a15e08944
2 changed files with 59 additions and 6 deletions

View File

@@ -535,10 +535,15 @@ def generate_embeddings(
     # skip_token_count=True: Use fast estimation (len/4) instead of expensive tiktoken
     # This significantly reduces CPU usage with minimal impact on metadata accuracy
+    # Load chunk stripping config from settings
+    from codexlens.config import Config
+    chunk_cfg = Config.load()
 
     chunker = Chunker(config=ChunkConfig(
         max_chunk_size=chunk_size,
         overlap=overlap,
-        skip_token_count=True
+        skip_token_count=True,
+        strip_comments=getattr(chunk_cfg, 'chunk_strip_comments', True),
+        strip_docstrings=getattr(chunk_cfg, 'chunk_strip_docstrings', True),
     ))
 
     # Log embedder info with endpoint count for multi-endpoint mode
@@ -1307,10 +1312,15 @@ def generate_dense_embeddings_centralized(
             "error": f"Invalid embedding backend: {embedding_backend}",
         }
 
+    # Load chunk stripping config from settings
+    from codexlens.config import Config
+    chunk_cfg = Config.load()
     chunker = Chunker(config=ChunkConfig(
         max_chunk_size=chunk_size,
         overlap=overlap,
-        skip_token_count=True
+        skip_token_count=True,
+        strip_comments=getattr(chunk_cfg, 'chunk_strip_comments', True),
+        strip_docstrings=getattr(chunk_cfg, 'chunk_strip_docstrings', True),
     ))
 
     if progress_callback:
@@ -1319,8 +1329,7 @@ def generate_dense_embeddings_centralized(
         progress_callback(f"Using model: {embedder.model_name} ({embedder.embedding_dim} dimensions)")
 
     # Calculate dynamic batch size based on model capacity
-    from codexlens.config import Config
-    batch_config = Config.load()
+    batch_config = chunk_cfg  # Reuse already loaded config
     effective_batch_size = calculate_dynamic_batch_size(batch_config, embedder)
 
     if progress_callback and batch_config.api_batch_size_dynamic:

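For orientation, the hunks above wire two new settings into the chunker and then reuse the loaded config for batch sizing instead of calling Config.load() twice. A minimal sketch of that flow, assuming Config.load() reads the user settings and that ChunkConfig/Chunker accept exactly the arguments shown in the diff (the Chunker import path here is hypothetical):

# Sketch: config-driven chunk stripping; names not in the diff are assumptions
from codexlens.config import Config
from codexlens.chunker import Chunker, ChunkConfig  # hypothetical import path

chunk_cfg = Config.load()  # load settings once...
chunker = Chunker(config=ChunkConfig(
    max_chunk_size=1024,
    overlap=128,
    skip_token_count=True,  # fast len/4 estimate instead of tiktoken
    # getattr keeps older settings files working: missing keys default to True
    strip_comments=getattr(chunk_cfg, "chunk_strip_comments", True),
    strip_docstrings=getattr(chunk_cfg, "chunk_strip_docstrings", True),
))
batch_config = chunk_cfg  # ...and reuse it for dynamic batch sizing, as in the third hunk

The getattr defaults mean both stripping options stay enabled unless a settings file explicitly turns them off, so existing configs keep working without migration.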
View File

@@ -412,7 +412,8 @@ class IndexTreeBuilder:
         A directory is indexed if:
         1. It's not in IGNORE_DIRS
         2. It doesn't start with '.'
-        3. It contains at least one supported language file
+        3. It contains at least one supported language file, OR
+        4. It has subdirectories that contain supported files (transitive)
 
         Args:
             dir_path: Directory to check
@@ -427,7 +428,50 @@ class IndexTreeBuilder:
         # Check for supported files in this directory
         source_files = self._iter_source_files(dir_path, languages)
-        return len(source_files) > 0
+        if len(source_files) > 0:
+            return True
+
+        # Check if any subdirectory has indexable files (transitive)
+        # This handles cases like 'src' which has no direct files but has 'src/codexlens'
+        for item in dir_path.iterdir():
+            if not item.is_dir():
+                continue
+            if item.name in self.IGNORE_DIRS or item.name.startswith("."):
+                continue
+            # Recursively check subdirectories
+            if self._has_indexable_files_recursive(item, languages):
+                return True
+
+        return False
+
+    def _has_indexable_files_recursive(self, dir_path: Path, languages: List[str] = None) -> bool:
+        """Check if directory or any subdirectory has indexable files.
+
+        Args:
+            dir_path: Directory to check
+            languages: Optional language filter
+
+        Returns:
+            True if directory tree contains indexable files
+        """
+        # Check for supported files in this directory
+        source_files = self._iter_source_files(dir_path, languages)
+        if len(source_files) > 0:
+            return True
+
+        # Check subdirectories
+        try:
+            for item in dir_path.iterdir():
+                if not item.is_dir():
+                    continue
+                if item.name in self.IGNORE_DIRS or item.name.startswith("."):
+                    continue
+                if self._has_indexable_files_recursive(item, languages):
+                    return True
+        except PermissionError:
+            pass
+
+        return False
+
     def _build_level_parallel(
         self,
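
For intuition on the transitive check, a self-contained sketch of the same recursion outside the class (IGNORE_DIRS and SUPPORTED_SUFFIXES here are stand-ins for IndexTreeBuilder's real filters, not the project's actual constants):

from pathlib import Path

IGNORE_DIRS = {"node_modules", "__pycache__", "dist"}  # stand-in for the class constant
SUPPORTED_SUFFIXES = {".py", ".ts", ".go"}             # stand-in for the language filter

def has_indexable_files(dir_path: Path) -> bool:
    """True if dir_path or any non-ignored subdirectory contains a supported file."""
    try:
        entries = list(dir_path.iterdir())
    except PermissionError:
        return False  # unreadable directories are treated as empty
    # Direct hit: a supported file sits in this directory
    if any(e.is_file() and e.suffix in SUPPORTED_SUFFIXES for e in entries):
        return True
    # Transitive hit: recurse into non-ignored, non-hidden subdirectories
    return any(
        e.is_dir()
        and e.name not in IGNORE_DIRS
        and not e.name.startswith(".")
        and has_indexable_files(e)
        for e in entries
    )

With a layout like src/codexlens/cli.py and no files directly under src, the old len(source_files) > 0 check skipped src entirely; the recursive variant returns True for it, which is exactly the case the commit message describes.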