mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-03-01 09:33:25 +08:00
Enhance semantic search capabilities and configuration
- Added category support for programming and documentation languages in Config. - Implemented category-based filtering in HybridSearchEngine to improve search relevance based on query intent. - Introduced functions for filtering results by category and determining file categories based on extensions. - Updated VectorStore to include a category column in the database schema and modified chunk addition methods to support category tagging. - Enhanced the WatcherConfig to ignore additional common directories and files. - Created a benchmark script to compare performance between Binary Cascade, SPLADE, and Vector semantic search methods, including detailed result analysis and overlap comparison.
This commit is contained in:
@@ -67,15 +67,21 @@ class Config:
|
||||
venv_path: Path = field(default_factory=lambda: _default_global_dir() / "venv")
|
||||
supported_languages: Dict[str, Dict[str, Any]] = field(
|
||||
default_factory=lambda: {
|
||||
"python": {"extensions": [".py"], "tree_sitter_language": "python"},
|
||||
"javascript": {"extensions": [".js", ".jsx"], "tree_sitter_language": "javascript"},
|
||||
"typescript": {"extensions": [".ts", ".tsx"], "tree_sitter_language": "typescript"},
|
||||
"java": {"extensions": [".java"], "tree_sitter_language": "java"},
|
||||
"go": {"extensions": [".go"], "tree_sitter_language": "go"},
|
||||
"zig": {"extensions": [".zig"], "tree_sitter_language": "zig"},
|
||||
"objective-c": {"extensions": [".m", ".mm"], "tree_sitter_language": "objc"},
|
||||
"markdown": {"extensions": [".md", ".mdx"], "tree_sitter_language": None},
|
||||
"text": {"extensions": [".txt"], "tree_sitter_language": None},
|
||||
# Source code languages (category: "code")
|
||||
"python": {"extensions": [".py"], "tree_sitter_language": "python", "category": "code"},
|
||||
"javascript": {"extensions": [".js", ".jsx"], "tree_sitter_language": "javascript", "category": "code"},
|
||||
"typescript": {"extensions": [".ts", ".tsx"], "tree_sitter_language": "typescript", "category": "code"},
|
||||
"java": {"extensions": [".java"], "tree_sitter_language": "java", "category": "code"},
|
||||
"go": {"extensions": [".go"], "tree_sitter_language": "go", "category": "code"},
|
||||
"zig": {"extensions": [".zig"], "tree_sitter_language": "zig", "category": "code"},
|
||||
"objective-c": {"extensions": [".m", ".mm"], "tree_sitter_language": "objc", "category": "code"},
|
||||
"c": {"extensions": [".c", ".h"], "tree_sitter_language": "c", "category": "code"},
|
||||
"cpp": {"extensions": [".cc", ".cpp", ".hpp", ".cxx"], "tree_sitter_language": "cpp", "category": "code"},
|
||||
"rust": {"extensions": [".rs"], "tree_sitter_language": "rust", "category": "code"},
|
||||
# Documentation languages (category: "doc")
|
||||
"markdown": {"extensions": [".md", ".mdx"], "tree_sitter_language": None, "category": "doc"},
|
||||
"text": {"extensions": [".txt"], "tree_sitter_language": None, "category": "doc"},
|
||||
"rst": {"extensions": [".rst"], "tree_sitter_language": None, "category": "doc"},
|
||||
}
|
||||
)
|
||||
parsing_rules: Dict[str, Dict[str, Any]] = field(
|
||||
@@ -141,6 +147,9 @@ class Config:
|
||||
fusion_method: str = "rrf" # "simple" (weighted sum) or "rrf" (reciprocal rank fusion)
|
||||
rrf_k: int = 60 # RRF constant (default 60)
|
||||
|
||||
# Category-based filtering to separate code/doc results
|
||||
enable_category_filter: bool = True # Enable code/doc result separation
|
||||
|
||||
# Multi-endpoint configuration for litellm backend
|
||||
embedding_endpoints: List[Dict[str, Any]] = field(default_factory=list)
|
||||
# List of endpoint configs: [{"model": "...", "api_key": "...", "api_base": "...", "weight": 1.0}]
|
||||
@@ -210,6 +219,14 @@ class Config:
|
||||
return language_id
|
||||
return None
|
||||
|
||||
def category_for_path(self, path: str | Path) -> str | None:
|
||||
"""Get file category ('code' or 'doc') from a file path."""
|
||||
language = self.language_for_path(path)
|
||||
if language is None:
|
||||
return None
|
||||
spec = self.supported_languages.get(language, {})
|
||||
return spec.get("category")
|
||||
|
||||
def rules_for_language(self, language_id: str) -> Dict[str, Any]:
|
||||
"""Get parsing rules for a specific language, falling back to defaults."""
|
||||
return {**self.parsing_rules.get("default", {}), **self.parsing_rules.get(language_id, {})}
|
||||
|
||||
Reference in New Issue
Block a user