Enhance semantic search capabilities and configuration

- Added category support for programming and documentation languages in Config.
- Implemented category-based filtering in HybridSearchEngine to improve search relevance based on query intent.
- Introduced functions for filtering results by category and determining file categories based on extensions.
- Updated VectorStore to include a category column in the database schema and modified chunk addition methods to support category tagging.
- Enhanced the WatcherConfig to ignore additional common directories and files.
- Created a benchmark script that compares the performance of the Binary Cascade, SPLADE, and Vector semantic search methods, including detailed result analysis and overlap comparison.
This commit is contained in:
catlog22
2026-01-02 15:01:20 +08:00
parent 92ed2524b7
commit 54fb7afdb2
7 changed files with 803 additions and 51 deletions

View File

@@ -30,8 +30,22 @@ class WatcherConfig:
"""Configuration for file watcher."""
debounce_ms: int = 1000
ignored_patterns: Set[str] = field(default_factory=lambda: {
".git", ".venv", "venv", "node_modules",
"__pycache__", ".codexlens", ".idea", ".vscode",
# Version control
".git", ".svn", ".hg",
# Python environments & cache
".venv", "venv", "env", "__pycache__", ".pytest_cache", ".mypy_cache", ".ruff_cache",
# Node.js
"node_modules", "bower_components", ".npm", ".yarn",
# Build artifacts
"dist", "build", "out", "target", "bin", "obj", "_build", "coverage", "htmlcov",
# IDE & Editor
".idea", ".vscode", ".vs", ".eclipse",
# CodexLens internal
".codexlens",
# Build tool & framework caches
".cache", ".parcel-cache", ".turbo", ".next", ".nuxt",
# Logs & temp
"logs", "tmp", "temp",
})
languages: Optional[List[str]] = None # None = all supported