feat: 增强索引树构建逻辑,支持递归检查子目录中的可索引文件

This commit is contained in:
catlog22
2026-01-13 11:08:48 +08:00
parent 8c2d39d517
commit 8a15e08944
2 changed files with 59 additions and 6 deletions

View File

@@ -535,10 +535,15 @@ def generate_embeddings(
# skip_token_count=True: Use fast estimation (len/4) instead of expensive tiktoken
# This significantly reduces CPU usage with minimal impact on metadata accuracy
# Load chunk stripping config from settings
from codexlens.config import Config
chunk_cfg = Config.load()
chunker = Chunker(config=ChunkConfig(
max_chunk_size=chunk_size,
overlap=overlap,
skip_token_count=True
skip_token_count=True,
strip_comments=getattr(chunk_cfg, 'chunk_strip_comments', True),
strip_docstrings=getattr(chunk_cfg, 'chunk_strip_docstrings', True),
))
# Log embedder info with endpoint count for multi-endpoint mode
@@ -1307,10 +1312,15 @@ def generate_dense_embeddings_centralized(
"error": f"Invalid embedding backend: {embedding_backend}",
}
# Load chunk stripping config from settings
from codexlens.config import Config
chunk_cfg = Config.load()
chunker = Chunker(config=ChunkConfig(
max_chunk_size=chunk_size,
overlap=overlap,
skip_token_count=True
skip_token_count=True,
strip_comments=getattr(chunk_cfg, 'chunk_strip_comments', True),
strip_docstrings=getattr(chunk_cfg, 'chunk_strip_docstrings', True),
))
if progress_callback:
@@ -1319,8 +1329,7 @@ def generate_dense_embeddings_centralized(
progress_callback(f"Using model: {embedder.model_name} ({embedder.embedding_dim} dimensions)")
# Calculate dynamic batch size based on model capacity
from codexlens.config import Config
batch_config = Config.load()
batch_config = chunk_cfg # Reuse already loaded config
effective_batch_size = calculate_dynamic_batch_size(batch_config, embedder)
if progress_callback and batch_config.api_batch_size_dynamic:

View File

@@ -412,7 +412,8 @@ class IndexTreeBuilder:
A directory is indexed if:
1. It's not in IGNORE_DIRS
2. It doesn't start with '.'
3. It contains at least one supported language file
3. It contains at least one supported language file, OR
4. It has subdirectories that contain supported files (transitive)
Args:
dir_path: Directory to check
@@ -427,7 +428,50 @@ class IndexTreeBuilder:
# Check for supported files in this directory
source_files = self._iter_source_files(dir_path, languages)
return len(source_files) > 0
if len(source_files) > 0:
return True
# Check if any subdirectory has indexable files (transitive)
# This handles cases like 'src' which has no direct files but has 'src/codexlens'
for item in dir_path.iterdir():
if not item.is_dir():
continue
if item.name in self.IGNORE_DIRS or item.name.startswith("."):
continue
# Recursively check subdirectories
if self._has_indexable_files_recursive(item, languages):
return True
return False
def _has_indexable_files_recursive(self, dir_path: Path, languages: "List[str] | None" = None) -> bool:
    """Check whether *dir_path* or any of its subdirectories contains indexable files.

    Transitive helper for directory indexing: a directory with no direct
    source files is still considered indexable when some nested
    subdirectory holds supported files (e.g. ``src`` containing only
    ``src/codexlens``).

    Args:
        dir_path: Directory to check.
        languages: Optional language filter, passed through unchanged to
            ``_iter_source_files``.

    Returns:
        True if the directory tree rooted at ``dir_path`` contains at
        least one indexable file; False otherwise (including when the
        directory cannot be read).
    """
    # Direct hit: this directory itself has supported source files.
    # _iter_source_files returns a sized collection here, so truthiness
    # is equivalent to the previous len(...) > 0 check.
    if self._iter_source_files(dir_path, languages):
        return True
    # Otherwise recurse into subdirectories, skipping ignored and
    # hidden (dot-prefixed) entries to mirror the caller's filtering.
    try:
        for child in dir_path.iterdir():
            if not child.is_dir():
                continue
            if child.name in self.IGNORE_DIRS or child.name.startswith("."):
                continue
            if self._has_indexable_files_recursive(child, languages):
                return True
    except OSError:
        # Best-effort scan: unreadable or vanished directories (permission
        # errors, filesystem races during iterdir) count as containing
        # no indexable files rather than aborting the whole tree build.
        pass
    return False
def _build_level_parallel(
self,