feat: enhance index tree building to recursively detect indexable files in subdirectories

This commit is contained in:
catlog22
2026-01-13 11:08:48 +08:00
parent 8c2d39d517
commit 8a15e08944
2 changed files with 59 additions and 6 deletions

View File

@@ -535,10 +535,15 @@ def generate_embeddings(
     # skip_token_count=True: Use fast estimation (len/4) instead of expensive tiktoken
     # This significantly reduces CPU usage with minimal impact on metadata accuracy
+    # Load chunk stripping config from settings
+    from codexlens.config import Config
+    chunk_cfg = Config.load()
 
     chunker = Chunker(config=ChunkConfig(
         max_chunk_size=chunk_size,
         overlap=overlap,
-        skip_token_count=True
+        skip_token_count=True,
+        strip_comments=getattr(chunk_cfg, 'chunk_strip_comments', True),
+        strip_docstrings=getattr(chunk_cfg, 'chunk_strip_docstrings', True),
     ))
 
     # Log embedder info with endpoint count for multi-endpoint mode
@@ -1307,10 +1312,15 @@ def generate_dense_embeddings_centralized(
             "error": f"Invalid embedding backend: {embedding_backend}",
         }
 
+    # Load chunk stripping config from settings
+    from codexlens.config import Config
+    chunk_cfg = Config.load()
     chunker = Chunker(config=ChunkConfig(
         max_chunk_size=chunk_size,
         overlap=overlap,
-        skip_token_count=True
+        skip_token_count=True,
+        strip_comments=getattr(chunk_cfg, 'chunk_strip_comments', True),
+        strip_docstrings=getattr(chunk_cfg, 'chunk_strip_docstrings', True),
     ))
 
     if progress_callback:
@@ -1319,8 +1329,7 @@ def generate_dense_embeddings_centralized(
         progress_callback(f"Using model: {embedder.model_name} ({embedder.embedding_dim} dimensions)")
 
     # Calculate dynamic batch size based on model capacity
-    from codexlens.config import Config
-    batch_config = Config.load()
+    batch_config = chunk_cfg  # Reuse already loaded config
     effective_batch_size = calculate_dynamic_batch_size(batch_config, embedder)
 
     if progress_callback and batch_config.api_batch_size_dynamic:

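For orientation, the hunks above wire two new settings into the chunker and then reuse the loaded config for batch sizing instead of calling Config.load() twice. A minimal sketch of that flow, assuming Config.load() reads the user settings and that ChunkConfig/Chunker accept exactly the arguments shown in the diff (the Chunker import path here is hypothetical):

# Sketch: config-driven chunk stripping; names not in the diff are assumptions
from codexlens.config import Config
from codexlens.chunker import Chunker, ChunkConfig  # hypothetical import path

chunk_cfg = Config.load()  # load settings once...
chunker = Chunker(config=ChunkConfig(
    max_chunk_size=1024,
    overlap=128,
    skip_token_count=True,  # fast len/4 estimate instead of tiktoken
    # getattr keeps older settings files working: missing keys default to True
    strip_comments=getattr(chunk_cfg, "chunk_strip_comments", True),
    strip_docstrings=getattr(chunk_cfg, "chunk_strip_docstrings", True),
))
batch_config = chunk_cfg  # ...and reuse it for dynamic batch sizing, as in the third hunk

The getattr defaults mean both stripping options stay enabled unless a settings file explicitly turns them off, so existing configs keep working without migration.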
View File

@@ -412,7 +412,8 @@ class IndexTreeBuilder:
         A directory is indexed if:
         1. It's not in IGNORE_DIRS
         2. It doesn't start with '.'
-        3. It contains at least one supported language file
+        3. It contains at least one supported language file, OR
+        4. It has subdirectories that contain supported files (transitive)
 
         Args:
             dir_path: Directory to check
@@ -427,7 +428,50 @@ class IndexTreeBuilder:
         # Check for supported files in this directory
         source_files = self._iter_source_files(dir_path, languages)
-        return len(source_files) > 0
+        if len(source_files) > 0:
+            return True
+
+        # Check if any subdirectory has indexable files (transitive)
+        # This handles cases like 'src' which has no direct files but has 'src/codexlens'
+        for item in dir_path.iterdir():
+            if not item.is_dir():
+                continue
+            if item.name in self.IGNORE_DIRS or item.name.startswith("."):
+                continue
+            # Recursively check subdirectories
+            if self._has_indexable_files_recursive(item, languages):
+                return True
+
+        return False
+
+    def _has_indexable_files_recursive(self, dir_path: Path, languages: List[str] = None) -> bool:
+        """Check if directory or any subdirectory has indexable files.
+
+        Args:
+            dir_path: Directory to check
+            languages: Optional language filter
+
+        Returns:
+            True if directory tree contains indexable files
+        """
+        # Check for supported files in this directory
+        source_files = self._iter_source_files(dir_path, languages)
+        if len(source_files) > 0:
+            return True
+
+        # Check subdirectories
+        try:
+            for item in dir_path.iterdir():
+                if not item.is_dir():
+                    continue
+                if item.name in self.IGNORE_DIRS or item.name.startswith("."):
+                    continue
+                if self._has_indexable_files_recursive(item, languages):
+                    return True
+        except PermissionError:
+            pass
+
+        return False
+
     def _build_level_parallel(
         self,
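
For intuition on the transitive check, a self-contained sketch of the same recursion outside the class (IGNORE_DIRS and SUPPORTED_SUFFIXES here are stand-ins for IndexTreeBuilder's real filters, not the project's actual constants):

from pathlib import Path

IGNORE_DIRS = {"node_modules", "__pycache__", "dist"}  # stand-in for the class constant
SUPPORTED_SUFFIXES = {".py", ".ts", ".go"}             # stand-in for the language filter

def has_indexable_files(dir_path: Path) -> bool:
    """True if dir_path or any non-ignored subdirectory contains a supported file."""
    try:
        entries = list(dir_path.iterdir())
    except PermissionError:
        return False  # unreadable directories are treated as empty
    # Direct hit: a supported file sits in this directory
    if any(e.is_file() and e.suffix in SUPPORTED_SUFFIXES for e in entries):
        return True
    # Transitive hit: recurse into non-ignored, non-hidden subdirectories
    return any(
        e.is_dir()
        and e.name not in IGNORE_DIRS
        and not e.name.startswith(".")
        and has_indexable_files(e)
        for e in entries
    )

With a layout like src/codexlens/cli.py and no files directly under src, the old len(source_files) > 0 check skipped src entirely; the recursive variant returns True for it, which is exactly the case the commit message describes.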