Enhance semantic search capabilities and configuration

- Added category support for programming and documentation languages in Config.
- Implemented category-based filtering in HybridSearchEngine to improve search relevance based on query intent.
- Introduced functions for filtering results by category and determining file categories based on extensions.
- Updated VectorStore to include a category column in the database schema and modified chunk addition methods to support category tagging.
- Enhanced the WatcherConfig to ignore additional common directories and files.
- Created a benchmark script to compare the performance of the Binary Cascade, SPLADE, and Vector semantic search methods, including detailed result analysis and overlap comparison.
This commit is contained in:
catlog22
2026-01-02 15:01:20 +08:00
parent 92ed2524b7
commit 54fb7afdb2
7 changed files with 803 additions and 51 deletions

View File

@@ -67,15 +67,21 @@ class Config:
venv_path: Path = field(default_factory=lambda: _default_global_dir() / "venv")
supported_languages: Dict[str, Dict[str, Any]] = field(
default_factory=lambda: {
"python": {"extensions": [".py"], "tree_sitter_language": "python"},
"javascript": {"extensions": [".js", ".jsx"], "tree_sitter_language": "javascript"},
"typescript": {"extensions": [".ts", ".tsx"], "tree_sitter_language": "typescript"},
"java": {"extensions": [".java"], "tree_sitter_language": "java"},
"go": {"extensions": [".go"], "tree_sitter_language": "go"},
"zig": {"extensions": [".zig"], "tree_sitter_language": "zig"},
"objective-c": {"extensions": [".m", ".mm"], "tree_sitter_language": "objc"},
"markdown": {"extensions": [".md", ".mdx"], "tree_sitter_language": None},
"text": {"extensions": [".txt"], "tree_sitter_language": None},
# Source code languages (category: "code")
"python": {"extensions": [".py"], "tree_sitter_language": "python", "category": "code"},
"javascript": {"extensions": [".js", ".jsx"], "tree_sitter_language": "javascript", "category": "code"},
"typescript": {"extensions": [".ts", ".tsx"], "tree_sitter_language": "typescript", "category": "code"},
"java": {"extensions": [".java"], "tree_sitter_language": "java", "category": "code"},
"go": {"extensions": [".go"], "tree_sitter_language": "go", "category": "code"},
"zig": {"extensions": [".zig"], "tree_sitter_language": "zig", "category": "code"},
"objective-c": {"extensions": [".m", ".mm"], "tree_sitter_language": "objc", "category": "code"},
"c": {"extensions": [".c", ".h"], "tree_sitter_language": "c", "category": "code"},
"cpp": {"extensions": [".cc", ".cpp", ".hpp", ".cxx"], "tree_sitter_language": "cpp", "category": "code"},
"rust": {"extensions": [".rs"], "tree_sitter_language": "rust", "category": "code"},
# Documentation languages (category: "doc")
"markdown": {"extensions": [".md", ".mdx"], "tree_sitter_language": None, "category": "doc"},
"text": {"extensions": [".txt"], "tree_sitter_language": None, "category": "doc"},
"rst": {"extensions": [".rst"], "tree_sitter_language": None, "category": "doc"},
}
)
parsing_rules: Dict[str, Dict[str, Any]] = field(
@@ -141,6 +147,9 @@ class Config:
fusion_method: str = "rrf" # "simple" (weighted sum) or "rrf" (reciprocal rank fusion)
rrf_k: int = 60 # RRF constant (default 60)
# Category-based filtering to separate code/doc results
enable_category_filter: bool = True # Enable code/doc result separation
# Multi-endpoint configuration for litellm backend
embedding_endpoints: List[Dict[str, Any]] = field(default_factory=list)
# List of endpoint configs: [{"model": "...", "api_key": "...", "api_base": "...", "weight": 1.0}]
@@ -210,6 +219,14 @@ class Config:
return language_id
return None
def category_for_path(self, path: str | Path) -> str | None:
"""Get file category ('code' or 'doc') from a file path."""
language = self.language_for_path(path)
if language is None:
return None
spec = self.supported_languages.get(language, {})
return spec.get("category")
def rules_for_language(self, language_id: str) -> Dict[str, Any]:
    """Build the effective parsing rules for one language.

    Args:
        language_id: Key to look up in ``parsing_rules``.

    Returns:
        A new dict holding the ``"default"`` rules overlaid with the rules
        registered for ``language_id``; language-specific values win on
        key collisions. Missing entries contribute nothing.
    """
    # Copy the defaults first so the stored mapping is never mutated.
    merged: Dict[str, Any] = dict(self.parsing_rules.get("default", {}))
    merged.update(self.parsing_rules.get(language_id, {}))
    return merged