From 8a15e08944aa6f4f766a79a8f9c01a990915850b Mon Sep 17 00:00:00 2001
From: catlog22 <catlog22@github.com>
Date: Tue, 13 Jan 2026 11:08:48 +0800
Subject: [PATCH] =?UTF-8?q?feat:=20=E5=A2=9E=E5=BC=BA=E7=B4=A2=E5=BC=95?=
 =?UTF-8?q?=E6=A0=91=E6=9E=84=E5=BB=BA=E9=80=BB=E8=BE=91=EF=BC=8C=E6=94=AF?=
 =?UTF-8?q?=E6=8C=81=E9=80=92=E5=BD=92=E6=A3=80=E6=9F=A5=E5=AD=90=E7=9B=AE?=
 =?UTF-8?q?=E5=BD=95=E4=B8=AD=E7=9A=84=E5=8F=AF=E7=B4=A2=E5=BC=95=E6=96=87?=
 =?UTF-8?q?=E4=BB=B6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../src/codexlens/cli/embedding_manager.py    | 17 +++++--
 .../src/codexlens/storage/index_tree.py       | 48 ++++++++++++++++++-
 2 files changed, 59 insertions(+), 6 deletions(-)

diff --git a/codex-lens/src/codexlens/cli/embedding_manager.py b/codex-lens/src/codexlens/cli/embedding_manager.py
index 7667bf1c..bb6467f5 100644
--- a/codex-lens/src/codexlens/cli/embedding_manager.py
+++ b/codex-lens/src/codexlens/cli/embedding_manager.py
@@ -535,10 +535,15 @@ def generate_embeddings(
 
         # skip_token_count=True: Use fast estimation (len/4) instead of expensive tiktoken
         # This significantly reduces CPU usage with minimal impact on metadata accuracy
+        # Load chunk stripping config from settings
+        from codexlens.config import Config
+        chunk_cfg = Config.load()
         chunker = Chunker(config=ChunkConfig(
             max_chunk_size=chunk_size,
             overlap=overlap,
-            skip_token_count=True
+            skip_token_count=True,
+            strip_comments=getattr(chunk_cfg, 'chunk_strip_comments', True),
+            strip_docstrings=getattr(chunk_cfg, 'chunk_strip_docstrings', True),
         ))
 
         # Log embedder info with endpoint count for multi-endpoint mode
@@ -1307,10 +1312,15 @@ def generate_dense_embeddings_centralized(
                 "error": f"Invalid embedding backend: {embedding_backend}",
             }
 
+        # Load chunk stripping config from settings
+        from codexlens.config import Config
+        chunk_cfg = Config.load()
         chunker = Chunker(config=ChunkConfig(
             max_chunk_size=chunk_size,
             overlap=overlap,
-            skip_token_count=True
+            skip_token_count=True,
+            strip_comments=getattr(chunk_cfg, 'chunk_strip_comments', True),
+            strip_docstrings=getattr(chunk_cfg, 'chunk_strip_docstrings', True),
         ))
 
         if progress_callback:
@@ -1319,8 +1329,7 @@ def generate_dense_embeddings_centralized(
             progress_callback(f"Using model: {embedder.model_name} ({embedder.embedding_dim} dimensions)")
 
         # Calculate dynamic batch size based on model capacity
-        from codexlens.config import Config
-        batch_config = Config.load()
+        batch_config = chunk_cfg  # Reuse already loaded config
         effective_batch_size = calculate_dynamic_batch_size(batch_config, embedder)
 
         if progress_callback and batch_config.api_batch_size_dynamic:
diff --git a/codex-lens/src/codexlens/storage/index_tree.py b/codex-lens/src/codexlens/storage/index_tree.py
index 3cbe601e..40ad85e7 100644
--- a/codex-lens/src/codexlens/storage/index_tree.py
+++ b/codex-lens/src/codexlens/storage/index_tree.py
@@ -412,7 +412,8 @@ class IndexTreeBuilder:
         A directory is indexed if:
         1. It's not in IGNORE_DIRS
         2. It doesn't start with '.'
-        3. It contains at least one supported language file
+        3. It contains at least one supported language file, OR
+        4. It has subdirectories that contain supported files (transitive)
 
         Args:
             dir_path: Directory to check
@@ -427,7 +428,50 @@ class IndexTreeBuilder:
 
         # Check for supported files in this directory
         source_files = self._iter_source_files(dir_path, languages)
-        return len(source_files) > 0
+        if len(source_files) > 0:
+            return True
+
+        # Check if any subdirectory has indexable files (transitive)
+        # This handles cases like 'src' which has no direct files but has 'src/codexlens'
+        for item in dir_path.iterdir():
+            if not item.is_dir():
+                continue
+            if item.name in self.IGNORE_DIRS or item.name.startswith("."):
+                continue
+            # Recursively check subdirectories
+            if self._has_indexable_files_recursive(item, languages):
+                return True
+
+        return False
+
+    def _has_indexable_files_recursive(self, dir_path: Path, languages: List[str] = None) -> bool:
+        """Return True if dir_path or any non-ignored descendant directory has source files.
+
+        Args:
+            dir_path: Root of the directory tree to search (depth-first, stops at first hit)
+            languages: Optional language filter, forwarded unchanged to _iter_source_files
+
+        Returns:
+            True as soon as _iter_source_files yields a non-empty result for any visited directory
+        """
+        # Files directly inside this directory settle the question without descending
+        source_files = self._iter_source_files(dir_path, languages)
+        if len(source_files) > 0:
+            return True
+
+        # Descend into child directories, skipping IGNORE_DIRS names and dot-directories
+        try:
+            for item in dir_path.iterdir():
+                if not item.is_dir():
+                    continue
+                if item.name in self.IGNORE_DIRS or item.name.startswith("."):
+                    continue
+                if self._has_indexable_files_recursive(item, languages):
+                    return True
+        except PermissionError:  # NOTE(review): ends the loop early, so unvisited siblings are skipped - confirm intended
+            pass
+
+        return False
 
     def _build_level_parallel(
         self,