Enhance semantic search capabilities and configuration

- Added category support for programming and documentation languages in Config.
- Implemented category-based filtering in HybridSearchEngine to improve search relevance based on query intent.
- Introduced functions for filtering results by category and determining file categories based on extensions.
- Updated VectorStore to include a category column in the database schema and modified chunk addition methods to support category tagging.
- Enhanced the WatcherConfig to ignore additional common directories and files.
- Created a benchmark script to compare performance between Binary Cascade, SPLADE, and Vector semantic search methods, including detailed result analysis and overlap comparison.
This commit is contained in:
catlog22
2026-01-02 15:01:20 +08:00
parent 92ed2524b7
commit 54fb7afdb2
7 changed files with 803 additions and 51 deletions

View File

@@ -17,6 +17,20 @@ except ImportError:
def is_embedding_backend_available(_backend: str):  # type: ignore[no-redef]
    """Fallback stub used when codexlens.semantic cannot be imported.

    Always reports the backend as unavailable, with a reason string.
    """
    reason = "codexlens.semantic not available"
    return False, reason
try:
    from codexlens.search.ranking import get_file_category
except ImportError:
    def get_file_category(path: str):  # type: ignore[no-redef]
        """Fallback: classify *path* as 'code', 'doc', or None by extension.

        Used only when codexlens.search.ranking is not importable; covers
        the common programming and documentation extensions.
        """
        suffix = Path(path).suffix.lower()
        if suffix in {".py", ".js", ".jsx", ".ts", ".tsx", ".java", ".go", ".c", ".cpp", ".rs"}:
            return "code"
        if suffix in {".md", ".mdx", ".txt", ".rst"}:
            return "doc"
        return None
logger = logging.getLogger(__name__)
# Embedding batch size - larger values improve throughput on modern hardware
@@ -24,6 +38,22 @@ logger = logging.getLogger(__name__)
EMBEDDING_BATCH_SIZE = 256
def _build_categories_from_batch(chunk_batch: List[Tuple[Any, str]]) -> List[str]:
"""Build categories list from chunk batch for index-level category filtering.
Args:
chunk_batch: List of (chunk, file_path) tuples
Returns:
List of category strings ('code' or 'doc'), defaulting to 'code' for unknown
"""
categories = []
for _, file_path in chunk_batch:
cat = get_file_category(file_path)
categories.append(cat if cat else "code") # Default to 'code' for unknown extensions
return categories
def _cleanup_fastembed_resources() -> None:
"""Best-effort cleanup for fastembed/ONNX resources (no-op for other backends)."""
try:
@@ -577,8 +607,9 @@ def generate_embeddings(
batch_contents = [chunk.content for chunk, _ in chunk_batch]
embeddings_numpy = embedder.embed_to_numpy(batch_contents, batch_size=EMBEDDING_BATCH_SIZE)
# Store embeddings
vector_store.add_chunks_batch_numpy(chunk_batch, embeddings_numpy)
# Store embeddings with category
categories = _build_categories_from_batch(chunk_batch)
vector_store.add_chunks_batch_numpy(chunk_batch, embeddings_numpy, categories=categories)
files_seen.update(batch_files)
total_chunks_created += len(chunk_batch)
@@ -630,7 +661,8 @@ def generate_embeddings(
batch_num, chunk_batch, embeddings_numpy, batch_files, error = f.result()
if embeddings_numpy is not None and error is None:
# Write to DB in main thread (no contention)
vector_store.add_chunks_batch_numpy(chunk_batch, embeddings_numpy)
categories = _build_categories_from_batch(chunk_batch)
vector_store.add_chunks_batch_numpy(chunk_batch, embeddings_numpy, categories=categories)
total_chunks_created += len(chunk_batch)
files_seen.update(batch_files)
total_files_processed = len(files_seen)
@@ -667,7 +699,8 @@ def generate_embeddings(
try:
batch_num, chunk_batch, embeddings_numpy, batch_files, error = future.result()
if embeddings_numpy is not None and error is None:
vector_store.add_chunks_batch_numpy(chunk_batch, embeddings_numpy)
categories = _build_categories_from_batch(chunk_batch)
vector_store.add_chunks_batch_numpy(chunk_batch, embeddings_numpy, categories=categories)
total_chunks_created += len(chunk_batch)
files_seen.update(batch_files)
total_files_processed = len(files_seen)