From 8a15e08944aa6f4f766a79a8f9c01a990915850b Mon Sep 17 00:00:00 2001 From: catlog22 Date: Tue, 13 Jan 2026 11:08:48 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E5=A2=9E=E5=BC=BA=E7=B4=A2=E5=BC=95?= =?UTF-8?q?=E6=A0=91=E6=9E=84=E5=BB=BA=E9=80=BB=E8=BE=91=EF=BC=8C=E6=94=AF?= =?UTF-8?q?=E6=8C=81=E9=80=92=E5=BD=92=E6=A3=80=E6=9F=A5=E5=AD=90=E7=9B=AE?= =?UTF-8?q?=E5=BD=95=E4=B8=AD=E7=9A=84=E5=8F=AF=E7=B4=A2=E5=BC=95=E6=96=87?= =?UTF-8?q?=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/codexlens/cli/embedding_manager.py | 17 +++++-- .../src/codexlens/storage/index_tree.py | 48 ++++++++++++++++++- 2 files changed, 59 insertions(+), 6 deletions(-) diff --git a/codex-lens/src/codexlens/cli/embedding_manager.py b/codex-lens/src/codexlens/cli/embedding_manager.py index 7667bf1c..bb6467f5 100644 --- a/codex-lens/src/codexlens/cli/embedding_manager.py +++ b/codex-lens/src/codexlens/cli/embedding_manager.py @@ -535,10 +535,15 @@ def generate_embeddings( # skip_token_count=True: Use fast estimation (len/4) instead of expensive tiktoken # This significantly reduces CPU usage with minimal impact on metadata accuracy + # Load chunk stripping config from settings + from codexlens.config import Config + chunk_cfg = Config.load() chunker = Chunker(config=ChunkConfig( max_chunk_size=chunk_size, overlap=overlap, - skip_token_count=True + skip_token_count=True, + strip_comments=getattr(chunk_cfg, 'chunk_strip_comments', True), + strip_docstrings=getattr(chunk_cfg, 'chunk_strip_docstrings', True), )) # Log embedder info with endpoint count for multi-endpoint mode @@ -1307,10 +1312,15 @@ def generate_dense_embeddings_centralized( "error": f"Invalid embedding backend: {embedding_backend}", } + # Load chunk stripping config from settings + from codexlens.config import Config + chunk_cfg = Config.load() chunker = Chunker(config=ChunkConfig( max_chunk_size=chunk_size, overlap=overlap, - skip_token_count=True + skip_token_count=True, + strip_comments=getattr(chunk_cfg, 'chunk_strip_comments', True), + strip_docstrings=getattr(chunk_cfg, 'chunk_strip_docstrings', True), )) if progress_callback: @@ -1319,8 +1329,7 @@ def generate_dense_embeddings_centralized( progress_callback(f"Using model: {embedder.model_name} ({embedder.embedding_dim} dimensions)") # Calculate dynamic batch size based on model capacity - from codexlens.config import Config - batch_config = Config.load() + batch_config = chunk_cfg # Reuse already loaded config effective_batch_size = calculate_dynamic_batch_size(batch_config, embedder) if progress_callback and batch_config.api_batch_size_dynamic: diff --git a/codex-lens/src/codexlens/storage/index_tree.py b/codex-lens/src/codexlens/storage/index_tree.py index 3cbe601e..40ad85e7 100644 --- a/codex-lens/src/codexlens/storage/index_tree.py +++ b/codex-lens/src/codexlens/storage/index_tree.py @@ -412,7 +412,8 @@ class IndexTreeBuilder: A directory is indexed if: 1. It's not in IGNORE_DIRS 2. It doesn't start with '.' - 3. It contains at least one supported language file + 3. It contains at least one supported language file, OR + 4. It has subdirectories that contain supported files (transitive) Args: dir_path: Directory to check @@ -427,7 +428,50 @@ class IndexTreeBuilder: # Check for supported files in this directory source_files = self._iter_source_files(dir_path, languages) - return len(source_files) > 0 + if len(source_files) > 0: + return True + + # Check if any subdirectory has indexable files (transitive) + # This handles cases like 'src' which has no direct files but has 'src/codexlens' + for item in dir_path.iterdir(): + if not item.is_dir(): + continue + if item.name in self.IGNORE_DIRS or item.name.startswith("."): + continue + # Recursively check subdirectories + if self._has_indexable_files_recursive(item, languages): + return True + + return False + + def _has_indexable_files_recursive(self, dir_path: Path, languages: List[str] = None) -> bool: + """Check if directory or any subdirectory has indexable files. + + Args: + dir_path: Directory to check + languages: Optional language filter + + Returns: + True if directory tree contains indexable files + """ + # Check for supported files in this directory + source_files = self._iter_source_files(dir_path, languages) + if len(source_files) > 0: + return True + + # Check subdirectories + try: + for item in dir_path.iterdir(): + if not item.is_dir(): + continue + if item.name in self.IGNORE_DIRS or item.name.startswith("."): + continue + if self._has_indexable_files_recursive(item, languages): + return True + except PermissionError: + pass + + return False def _build_level_parallel( self,