mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-02-15 02:42:45 +08:00
feat: 增强索引树构建逻辑,支持递归检查子目录中的可索引文件
This commit is contained in:
@@ -535,10 +535,15 @@ def generate_embeddings(
|
|||||||
|
|
||||||
# skip_token_count=True: Use fast estimation (len/4) instead of expensive tiktoken
|
# skip_token_count=True: Use fast estimation (len/4) instead of expensive tiktoken
|
||||||
# This significantly reduces CPU usage with minimal impact on metadata accuracy
|
# This significantly reduces CPU usage with minimal impact on metadata accuracy
|
||||||
|
# Load chunk stripping config from settings
|
||||||
|
from codexlens.config import Config
|
||||||
|
chunk_cfg = Config.load()
|
||||||
chunker = Chunker(config=ChunkConfig(
|
chunker = Chunker(config=ChunkConfig(
|
||||||
max_chunk_size=chunk_size,
|
max_chunk_size=chunk_size,
|
||||||
overlap=overlap,
|
overlap=overlap,
|
||||||
skip_token_count=True
|
skip_token_count=True,
|
||||||
|
strip_comments=getattr(chunk_cfg, 'chunk_strip_comments', True),
|
||||||
|
strip_docstrings=getattr(chunk_cfg, 'chunk_strip_docstrings', True),
|
||||||
))
|
))
|
||||||
|
|
||||||
# Log embedder info with endpoint count for multi-endpoint mode
|
# Log embedder info with endpoint count for multi-endpoint mode
|
||||||
@@ -1307,10 +1312,15 @@ def generate_dense_embeddings_centralized(
|
|||||||
"error": f"Invalid embedding backend: {embedding_backend}",
|
"error": f"Invalid embedding backend: {embedding_backend}",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Load chunk stripping config from settings
|
||||||
|
from codexlens.config import Config
|
||||||
|
chunk_cfg = Config.load()
|
||||||
chunker = Chunker(config=ChunkConfig(
|
chunker = Chunker(config=ChunkConfig(
|
||||||
max_chunk_size=chunk_size,
|
max_chunk_size=chunk_size,
|
||||||
overlap=overlap,
|
overlap=overlap,
|
||||||
skip_token_count=True
|
skip_token_count=True,
|
||||||
|
strip_comments=getattr(chunk_cfg, 'chunk_strip_comments', True),
|
||||||
|
strip_docstrings=getattr(chunk_cfg, 'chunk_strip_docstrings', True),
|
||||||
))
|
))
|
||||||
|
|
||||||
if progress_callback:
|
if progress_callback:
|
||||||
@@ -1319,8 +1329,7 @@ def generate_dense_embeddings_centralized(
|
|||||||
progress_callback(f"Using model: {embedder.model_name} ({embedder.embedding_dim} dimensions)")
|
progress_callback(f"Using model: {embedder.model_name} ({embedder.embedding_dim} dimensions)")
|
||||||
|
|
||||||
# Calculate dynamic batch size based on model capacity
|
# Calculate dynamic batch size based on model capacity
|
||||||
from codexlens.config import Config
|
batch_config = chunk_cfg # Reuse already loaded config
|
||||||
batch_config = Config.load()
|
|
||||||
effective_batch_size = calculate_dynamic_batch_size(batch_config, embedder)
|
effective_batch_size = calculate_dynamic_batch_size(batch_config, embedder)
|
||||||
|
|
||||||
if progress_callback and batch_config.api_batch_size_dynamic:
|
if progress_callback and batch_config.api_batch_size_dynamic:
|
||||||
|
|||||||
@@ -412,7 +412,8 @@ class IndexTreeBuilder:
|
|||||||
A directory is indexed if:
|
A directory is indexed if:
|
||||||
1. It's not in IGNORE_DIRS
|
1. It's not in IGNORE_DIRS
|
||||||
2. It doesn't start with '.'
|
2. It doesn't start with '.'
|
||||||
3. It contains at least one supported language file
|
3. It contains at least one supported language file, OR
|
||||||
|
4. It has subdirectories that contain supported files (transitive)
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
dir_path: Directory to check
|
dir_path: Directory to check
|
||||||
@@ -427,7 +428,50 @@ class IndexTreeBuilder:
|
|||||||
|
|
||||||
# Check for supported files in this directory
|
# Check for supported files in this directory
|
||||||
source_files = self._iter_source_files(dir_path, languages)
|
source_files = self._iter_source_files(dir_path, languages)
|
||||||
return len(source_files) > 0
|
if len(source_files) > 0:
|
||||||
|
return True
|
||||||
|
|
||||||
|
# Check if any subdirectory has indexable files (transitive)
|
||||||
|
# This handles cases like 'src' which has no direct files but has 'src/codexlens'
|
||||||
|
for item in dir_path.iterdir():
|
||||||
|
if not item.is_dir():
|
||||||
|
continue
|
||||||
|
if item.name in self.IGNORE_DIRS or item.name.startswith("."):
|
||||||
|
continue
|
||||||
|
# Recursively check subdirectories
|
||||||
|
if self._has_indexable_files_recursive(item, languages):
|
||||||
|
return True
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
def _has_indexable_files_recursive(self, dir_path: Path, languages: List[str] = None) -> bool:
    """Check if directory or any subdirectory has indexable files.

    The directory itself is checked first via ``_iter_source_files``. If it
    has no matching files, every subdirectory that is neither listed in
    ``IGNORE_DIRS`` nor hidden (name starting with ``.``) is checked
    recursively.

    A ``PermissionError`` raised while inspecting a directory is treated as
    "no indexable files in that directory" and does not stop the remaining
    sibling directories from being checked.

    Args:
        dir_path: Directory to check
        languages: Optional language filter

    Returns:
        True if directory tree contains indexable files
    """
    try:
        # A direct hit short-circuits the walk.
        if self._iter_source_files(dir_path, languages):
            return True

        # Snapshot the candidate subdirectories inside the try so that an
        # unreadable directory is handled here, at its own level, instead
        # of surfacing in the caller's loop and cutting it short.
        # NOTE(review): Path.is_dir() follows symlinks, so a symlink cycle
        # would recurse without bound - confirm whether symlinked
        # directories can occur in indexed trees.
        subdirs = [
            entry
            for entry in dir_path.iterdir()
            if entry.is_dir()
            and entry.name not in self.IGNORE_DIRS
            and not entry.name.startswith(".")
        ]
    except PermissionError:
        # Unreadable directory: best-effort, report nothing indexable here.
        return False

    # Each recursive call deals with its own PermissionError, so one
    # inaccessible subdirectory cannot hide indexable siblings.
    return any(
        self._has_indexable_files_recursive(subdir, languages)
        for subdir in subdirs
    )
|
||||||
|
|
||||||
def _build_level_parallel(
|
def _build_level_parallel(
|
||||||
self,
|
self,
|
||||||
|
|||||||
Reference in New Issue
Block a user