Enhance semantic search capabilities and configuration

- Added category support for programming and documentation languages in Config.
- Implemented category-based filtering in HybridSearchEngine to improve search relevance based on query intent.
- Introduced functions for filtering results by category and determining file categories based on extensions.
- Updated VectorStore to include a category column in the database schema and modified chunk addition methods to support category tagging.
- Enhanced the WatcherConfig to ignore additional common directories and files.
- Created a benchmark script to compare performance between Binary Cascade, SPLADE, and Vector semantic search methods, including detailed result analysis and overlap comparison.
This commit is contained in:
catlog22
2026-01-02 15:01:20 +08:00
parent 92ed2524b7
commit 54fb7afdb2
7 changed files with 803 additions and 51 deletions

View File

@@ -17,6 +17,20 @@ except ImportError:
def is_embedding_backend_available(_backend: str):  # type: ignore[no-redef]
    """Fallback probe used when codexlens.semantic cannot be imported.

    Always reports unavailability, regardless of the backend name.
    """
    reason = "codexlens.semantic not available"
    return False, reason
try:
    from codexlens.search.ranking import get_file_category
except ImportError:
    def get_file_category(path: str):  # type: ignore[no-redef]
        """Fallback: classify *path* as 'code' or 'doc' by its file extension.

        Returns 'code' for common source extensions, 'doc' for common
        documentation extensions, and None when the extension is unknown.
        """
        suffix = Path(path).suffix.lower()
        if suffix in {".py", ".js", ".jsx", ".ts", ".tsx", ".java", ".go", ".c", ".cpp", ".rs"}:
            return "code"
        if suffix in {".md", ".mdx", ".txt", ".rst"}:
            return "doc"
        return None
logger = logging.getLogger(__name__)
# Embedding batch size - larger values improve throughput on modern hardware
@@ -24,6 +38,22 @@ logger = logging.getLogger(__name__)
EMBEDDING_BATCH_SIZE = 256
def _build_categories_from_batch(chunk_batch: List[Tuple[Any, str]]) -> List[str]:
    """Build categories list from chunk batch for index-level category filtering.

    Args:
        chunk_batch: List of (chunk, file_path) tuples

    Returns:
        List of category strings ('code' or 'doc'), defaulting to 'code'
        for unknown extensions so every chunk carries a category.
    """
    # get_file_category returns None for unrecognized extensions; treat those
    # as 'code' to match the schema's DEFAULT 'code'.
    return [get_file_category(path) or "code" for _, path in chunk_batch]
def _cleanup_fastembed_resources() -> None:
"""Best-effort cleanup for fastembed/ONNX resources (no-op for other backends)."""
try:
@@ -577,8 +607,9 @@ def generate_embeddings(
batch_contents = [chunk.content for chunk, _ in chunk_batch]
embeddings_numpy = embedder.embed_to_numpy(batch_contents, batch_size=EMBEDDING_BATCH_SIZE)
# Store embeddings
vector_store.add_chunks_batch_numpy(chunk_batch, embeddings_numpy)
# Store embeddings with category
categories = _build_categories_from_batch(chunk_batch)
vector_store.add_chunks_batch_numpy(chunk_batch, embeddings_numpy, categories=categories)
files_seen.update(batch_files)
total_chunks_created += len(chunk_batch)
@@ -630,7 +661,8 @@ def generate_embeddings(
batch_num, chunk_batch, embeddings_numpy, batch_files, error = f.result()
if embeddings_numpy is not None and error is None:
# Write to DB in main thread (no contention)
vector_store.add_chunks_batch_numpy(chunk_batch, embeddings_numpy)
categories = _build_categories_from_batch(chunk_batch)
vector_store.add_chunks_batch_numpy(chunk_batch, embeddings_numpy, categories=categories)
total_chunks_created += len(chunk_batch)
files_seen.update(batch_files)
total_files_processed = len(files_seen)
@@ -667,7 +699,8 @@ def generate_embeddings(
try:
batch_num, chunk_batch, embeddings_numpy, batch_files, error = future.result()
if embeddings_numpy is not None and error is None:
vector_store.add_chunks_batch_numpy(chunk_batch, embeddings_numpy)
categories = _build_categories_from_batch(chunk_batch)
vector_store.add_chunks_batch_numpy(chunk_batch, embeddings_numpy, categories=categories)
total_chunks_created += len(chunk_batch)
files_seen.update(batch_files)
total_files_processed = len(files_seen)

View File

@@ -67,15 +67,21 @@ class Config:
venv_path: Path = field(default_factory=lambda: _default_global_dir() / "venv")
supported_languages: Dict[str, Dict[str, Any]] = field(
default_factory=lambda: {
"python": {"extensions": [".py"], "tree_sitter_language": "python"},
"javascript": {"extensions": [".js", ".jsx"], "tree_sitter_language": "javascript"},
"typescript": {"extensions": [".ts", ".tsx"], "tree_sitter_language": "typescript"},
"java": {"extensions": [".java"], "tree_sitter_language": "java"},
"go": {"extensions": [".go"], "tree_sitter_language": "go"},
"zig": {"extensions": [".zig"], "tree_sitter_language": "zig"},
"objective-c": {"extensions": [".m", ".mm"], "tree_sitter_language": "objc"},
"markdown": {"extensions": [".md", ".mdx"], "tree_sitter_language": None},
"text": {"extensions": [".txt"], "tree_sitter_language": None},
# Source code languages (category: "code")
"python": {"extensions": [".py"], "tree_sitter_language": "python", "category": "code"},
"javascript": {"extensions": [".js", ".jsx"], "tree_sitter_language": "javascript", "category": "code"},
"typescript": {"extensions": [".ts", ".tsx"], "tree_sitter_language": "typescript", "category": "code"},
"java": {"extensions": [".java"], "tree_sitter_language": "java", "category": "code"},
"go": {"extensions": [".go"], "tree_sitter_language": "go", "category": "code"},
"zig": {"extensions": [".zig"], "tree_sitter_language": "zig", "category": "code"},
"objective-c": {"extensions": [".m", ".mm"], "tree_sitter_language": "objc", "category": "code"},
"c": {"extensions": [".c", ".h"], "tree_sitter_language": "c", "category": "code"},
"cpp": {"extensions": [".cc", ".cpp", ".hpp", ".cxx"], "tree_sitter_language": "cpp", "category": "code"},
"rust": {"extensions": [".rs"], "tree_sitter_language": "rust", "category": "code"},
# Documentation languages (category: "doc")
"markdown": {"extensions": [".md", ".mdx"], "tree_sitter_language": None, "category": "doc"},
"text": {"extensions": [".txt"], "tree_sitter_language": None, "category": "doc"},
"rst": {"extensions": [".rst"], "tree_sitter_language": None, "category": "doc"},
}
)
parsing_rules: Dict[str, Dict[str, Any]] = field(
@@ -141,6 +147,9 @@ class Config:
fusion_method: str = "rrf" # "simple" (weighted sum) or "rrf" (reciprocal rank fusion)
rrf_k: int = 60 # RRF constant (default 60)
# Category-based filtering to separate code/doc results
enable_category_filter: bool = True # Enable code/doc result separation
# Multi-endpoint configuration for litellm backend
embedding_endpoints: List[Dict[str, Any]] = field(default_factory=list)
# List of endpoint configs: [{"model": "...", "api_key": "...", "api_base": "...", "weight": 1.0}]
@@ -210,6 +219,14 @@ class Config:
return language_id
return None
def category_for_path(self, path: str | Path) -> str | None:
"""Get file category ('code' or 'doc') from a file path."""
language = self.language_for_path(path)
if language is None:
return None
spec = self.supported_languages.get(language, {})
return spec.get("category")
def rules_for_language(self, language_id: str) -> Dict[str, Any]:
"""Get parsing rules for a specific language, falling back to defaults."""
return {**self.parsing_rules.get("default", {}), **self.parsing_rules.get(language_id, {})}

View File

@@ -35,8 +35,11 @@ from codexlens.entities import SearchResult
from codexlens.search.ranking import (
DEFAULT_WEIGHTS,
FTS_FALLBACK_WEIGHTS,
QueryIntent,
apply_symbol_boost,
cross_encoder_rerank,
detect_query_intent,
filter_results_by_category,
get_rrf_weights,
reciprocal_rank_fusion,
rerank_results,
@@ -131,6 +134,16 @@ class HybridSearchEngine:
except OSError:
return []
# Detect query intent early for category filtering at index level
query_intent = detect_query_intent(query)
# Map intent to category for vector search:
# - KEYWORD (code intent) -> filter to 'code' only
# - SEMANTIC (doc intent) -> no filter (allow docs to surface)
# - MIXED -> no filter (allow all)
vector_category: Optional[str] = None
if query_intent == QueryIntent.KEYWORD:
vector_category = "code"
# Determine which backends to use
backends = {}
@@ -183,7 +196,7 @@ class HybridSearchEngine:
# Execute parallel searches
with timer("parallel_search_total", self.logger):
results_map = self._search_parallel(index_path, query, backends, limit)
results_map = self._search_parallel(index_path, query, backends, limit, vector_category)
# Provide helpful message if pure-vector mode returns no results
if pure_vector and enable_vector and len(results_map.get("vector", [])) == 0:
@@ -263,6 +276,19 @@ class HybridSearchEngine:
top_k=self._config.reranker_top_k,
)
# Apply category filtering to avoid code/doc pollution
# This ensures KEYWORD queries return code files, SEMANTIC queries prefer docs
enable_category_filter = (
self._config is None
or getattr(self._config, 'enable_category_filter', True)
)
if enable_category_filter and not pure_vector:
with timer("category_filter", self.logger):
query_intent = detect_query_intent(query)
fused_results = filter_results_by_category(
fused_results, query_intent, allow_mixed=True
)
# Apply final limit
return fused_results[:limit]
@@ -361,6 +387,7 @@ class HybridSearchEngine:
query: str,
backends: Dict[str, bool],
limit: int,
category: Optional[str] = None,
) -> Dict[str, List[SearchResult]]:
"""Execute parallel searches across enabled backends.
@@ -369,6 +396,7 @@ class HybridSearchEngine:
query: FTS5 query string
backends: Dictionary of backend name to enabled flag
limit: Results limit per backend
category: Optional category filter for vector search ('code' or 'doc')
Returns:
Dictionary mapping source name to results list
@@ -399,7 +427,7 @@ class HybridSearchEngine:
if backends.get("vector"):
submit_times["vector"] = time.perf_counter()
future = executor.submit(
self._search_vector, index_path, query, limit
self._search_vector, index_path, query, limit, category
)
future_to_source[future] = "vector"
@@ -490,7 +518,7 @@ class HybridSearchEngine:
return []
def _search_vector(
self, index_path: Path, query: str, limit: int
self, index_path: Path, query: str, limit: int, category: Optional[str] = None
) -> List[SearchResult]:
"""Execute vector similarity search using semantic embeddings.
@@ -498,6 +526,7 @@ class HybridSearchEngine:
index_path: Path to _index.db file
query: Natural language query string
limit: Maximum results
category: Optional category filter ('code' or 'doc')
Returns:
List of SearchResult objects ordered by semantic similarity
@@ -616,6 +645,7 @@ class HybridSearchEngine:
top_k=limit,
min_score=0.0, # Return all results, let RRF handle filtering
return_full_content=True,
category=category,
)
self.logger.debug(
"[TIMING] vector_similarity_search: %.2fms (%d results)",

View File

@@ -9,7 +9,8 @@ from __future__ import annotations
import re
import math
from enum import Enum
from typing import Any, Dict, List
from pathlib import Path
from typing import Any, Dict, List, Optional
from codexlens.entities import SearchResult, AdditionalLocation
@@ -132,6 +133,83 @@ def get_rrf_weights(
return adjust_weights_by_intent(detect_query_intent(query), base_weights)
# Extension -> category lookup table, built once at import time for O(1) checks.
_EXT_TO_CATEGORY: Dict[str, str] = {
    ext: "code"
    for ext in (
        ".py", ".js", ".jsx", ".ts", ".tsx",
        ".java", ".go", ".zig", ".m", ".mm",
        ".c", ".h", ".cc", ".cpp", ".hpp", ".cxx",
        ".rs",
    )
}
_EXT_TO_CATEGORY.update({ext: "doc" for ext in (".md", ".mdx", ".txt", ".rst")})


def get_file_category(path: str) -> Optional[str]:
    """Classify *path* by extension.

    Args:
        path: File path string

    Returns:
        'code' or 'doc' for recognized extensions, None otherwise.
    """
    return _EXT_TO_CATEGORY.get(Path(path).suffix.lower())
def filter_results_by_category(
    results: List[SearchResult],
    intent: QueryIntent,
    allow_mixed: bool = True,
) -> List[SearchResult]:
    """Reorder/filter search results so code and doc hits don't pollute each other.

    Strategy:
        - KEYWORD (code intent): only code files (plus unknown, which may be code)
        - SEMANTIC (doc intent): docs first; code kept only when allow_mixed=True
        - MIXED: results returned untouched

    Args:
        results: List of SearchResult objects
        intent: Query intent from detect_query_intent()
        allow_mixed: If True, SEMANTIC intent includes code files after docs

    Returns:
        Filtered and re-ranked list of SearchResult objects
    """
    if not results or intent == QueryIntent.MIXED:
        return results

    # Partition results into three buckets by file category.
    code_hits: List[SearchResult] = []
    doc_hits: List[SearchResult] = []
    other_hits: List[SearchResult] = []
    for result in results:
        cat = get_file_category(result.path)
        bucket = code_hits if cat == "code" else doc_hits if cat == "doc" else other_hits
        bucket.append(result)

    if intent == QueryIntent.KEYWORD:
        # Code intent: code files plus unknowns (which might still be code).
        return code_hits + other_hits
    if intent == QueryIntent.SEMANTIC:
        if allow_mixed:
            # Docs surface first, code demoted below them.
            return doc_hits + code_hits + other_hits
        # Strict doc intent: drop code entirely.
        return doc_hits + other_hits
    return results
def simple_weighted_fusion(
results_map: Dict[str, List[SearchResult]],
weights: Dict[str, float] = None,

View File

@@ -155,6 +155,7 @@ class VectorStore:
content TEXT NOT NULL,
embedding BLOB NOT NULL,
metadata TEXT,
category TEXT DEFAULT 'code',
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
""")
@@ -162,6 +163,10 @@ class VectorStore:
CREATE INDEX IF NOT EXISTS idx_chunks_file
ON semantic_chunks(file_path)
""")
conn.execute("""
CREATE INDEX IF NOT EXISTS idx_chunks_category
ON semantic_chunks(category)
""")
# Model configuration table - tracks which model generated the embeddings
conn.execute("""
CREATE TABLE IF NOT EXISTS embeddings_config (
@@ -177,6 +182,8 @@ class VectorStore:
# Migration: Add backend column to existing tables
self._migrate_backend_column(conn)
# Migration: Add category column
self._migrate_category_column(conn)
conn.commit()
@@ -197,6 +204,28 @@ class VectorStore:
ADD COLUMN backend TEXT NOT NULL DEFAULT 'fastembed'
""")
def _migrate_category_column(self, conn: sqlite3.Connection) -> None:
    """Add category column to existing semantic_chunks table if not present.

    Idempotent: inspects the live schema first, so re-running on an
    already-migrated database is a no-op.

    Args:
        conn: Active SQLite connection
    """
    existing = {row[1] for row in conn.execute("PRAGMA table_info(semantic_chunks)")}
    if "category" in existing:
        return
    logger.info("Migrating semantic_chunks table: adding category column")
    conn.execute("""
        ALTER TABLE semantic_chunks
        ADD COLUMN category TEXT DEFAULT 'code'
    """)
    # Index the new column so category-filtered searches stay fast.
    conn.execute("""
        CREATE INDEX IF NOT EXISTS idx_chunks_category
        ON semantic_chunks(category)
    """)
def _init_ann_index(self) -> None:
"""Initialize ANN index (lazy loading from existing data)."""
if not HNSWLIB_AVAILABLE:
@@ -390,9 +419,16 @@ class VectorStore:
self._ann_index = None
return False
def add_chunk(self, chunk: SemanticChunk, file_path: str) -> int:
def add_chunk(
self, chunk: SemanticChunk, file_path: str, category: str = "code"
) -> int:
"""Add a single chunk with its embedding.
Args:
chunk: SemanticChunk with embedding
file_path: Path to the source file
category: File category ('code' or 'doc'), default 'code'
Returns:
The inserted chunk ID.
"""
@@ -406,10 +442,10 @@ class VectorStore:
with sqlite3.connect(self.db_path) as conn:
cursor = conn.execute(
"""
INSERT INTO semantic_chunks (file_path, content, embedding, metadata)
VALUES (?, ?, ?, ?)
INSERT INTO semantic_chunks (file_path, content, embedding, metadata, category)
VALUES (?, ?, ?, ?, ?)
""",
(file_path, chunk.content, embedding_blob, metadata_json)
(file_path, chunk.content, embedding_blob, metadata_json, category)
)
conn.commit()
chunk_id = cursor.lastrowid or 0
@@ -427,9 +463,16 @@ class VectorStore:
self._invalidate_cache()
return chunk_id
def add_chunks(self, chunks: List[SemanticChunk], file_path: str) -> List[int]:
def add_chunks(
self, chunks: List[SemanticChunk], file_path: str, category: str = "code"
) -> List[int]:
"""Add multiple chunks with embeddings (batch insert).
Args:
chunks: List of SemanticChunk objects with embeddings
file_path: Path to the source file
category: File category ('code' or 'doc'), default 'code'
Returns:
List of inserted chunk IDs.
"""
@@ -445,7 +488,7 @@ class VectorStore:
embedding_arr = np.array(chunk.embedding, dtype=np.float32)
embedding_blob = embedding_arr.tobytes()
metadata_json = json.dumps(chunk.metadata) if chunk.metadata else None
batch_data.append((file_path, chunk.content, embedding_blob, metadata_json))
batch_data.append((file_path, chunk.content, embedding_blob, metadata_json, category))
embeddings_list.append(embedding_arr)
# Batch insert to SQLite
@@ -456,8 +499,8 @@ class VectorStore:
conn.executemany(
"""
INSERT INTO semantic_chunks (file_path, content, embedding, metadata)
VALUES (?, ?, ?, ?)
INSERT INTO semantic_chunks (file_path, content, embedding, metadata, category)
VALUES (?, ?, ?, ?, ?)
""",
batch_data
)
@@ -484,6 +527,7 @@ class VectorStore:
chunks_with_paths: List[Tuple[SemanticChunk, str]],
update_ann: bool = True,
auto_save_ann: bool = True,
categories: Optional[List[str]] = None,
) -> List[int]:
"""Batch insert chunks from multiple files in a single transaction.
@@ -494,6 +538,8 @@ class VectorStore:
update_ann: If True, update ANN index with new vectors (default: True)
auto_save_ann: If True, save ANN index after update (default: True).
Set to False for bulk inserts to reduce I/O overhead.
categories: Optional list of categories per chunk. If None, defaults to 'code'.
If provided, must match length of chunks_with_paths.
Returns:
List of inserted chunk IDs
@@ -503,10 +549,17 @@ class VectorStore:
batch_size = len(chunks_with_paths)
# Validate categories if provided
if categories is not None and len(categories) != batch_size:
raise ValueError(
f"categories length ({len(categories)}) must match "
f"chunks_with_paths length ({batch_size})"
)
# Prepare batch data
batch_data = []
embeddings_list = []
for chunk, file_path in chunks_with_paths:
for i, (chunk, file_path) in enumerate(chunks_with_paths):
if chunk.embedding is None:
raise ValueError("All chunks must have embeddings")
# Optimize: avoid repeated np.array() if already numpy
@@ -516,7 +569,8 @@ class VectorStore:
embedding_arr = np.array(chunk.embedding, dtype=np.float32)
embedding_blob = embedding_arr.tobytes()
metadata_json = json.dumps(chunk.metadata) if chunk.metadata else None
batch_data.append((file_path, chunk.content, embedding_blob, metadata_json))
category = categories[i] if categories else "code"
batch_data.append((file_path, chunk.content, embedding_blob, metadata_json, category))
embeddings_list.append(embedding_arr)
# Batch insert to SQLite in single transaction
@@ -529,8 +583,8 @@ class VectorStore:
conn.executemany(
"""
INSERT INTO semantic_chunks (file_path, content, embedding, metadata)
VALUES (?, ?, ?, ?)
INSERT INTO semantic_chunks (file_path, content, embedding, metadata, category)
VALUES (?, ?, ?, ?, ?)
""",
batch_data
)
@@ -565,6 +619,7 @@ class VectorStore:
embeddings_matrix: np.ndarray,
update_ann: bool = True,
auto_save_ann: bool = True,
categories: Optional[List[str]] = None,
) -> List[int]:
"""Batch insert chunks with pre-computed numpy embeddings matrix.
@@ -576,6 +631,7 @@ class VectorStore:
embeddings_matrix: Pre-computed embeddings as (N, D) numpy array
update_ann: If True, update ANN index with new vectors (default: True)
auto_save_ann: If True, save ANN index after update (default: True)
categories: Optional list of categories per chunk. If None, defaults to 'code'.
Returns:
List of inserted chunk IDs
@@ -591,6 +647,13 @@ class VectorStore:
f"{embeddings_matrix.shape[0]} embeddings"
)
# Validate categories if provided
if categories is not None and len(categories) != batch_size:
raise ValueError(
f"categories length ({len(categories)}) must match "
f"chunks_with_paths length ({batch_size})"
)
# Ensure float32 format
embeddings_matrix = embeddings_matrix.astype(np.float32)
@@ -600,7 +663,8 @@ class VectorStore:
embedding_arr = embeddings_matrix[i]
embedding_blob = embedding_arr.tobytes()
metadata_json = json.dumps(chunk.metadata) if chunk.metadata else None
batch_data.append((file_path, chunk.content, embedding_blob, metadata_json))
category = categories[i] if categories else "code"
batch_data.append((file_path, chunk.content, embedding_blob, metadata_json, category))
# Batch insert to SQLite in single transaction
with sqlite3.connect(self.db_path) as conn:
@@ -612,8 +676,8 @@ class VectorStore:
conn.executemany(
"""
INSERT INTO semantic_chunks (file_path, content, embedding, metadata)
VALUES (?, ?, ?, ?)
INSERT INTO semantic_chunks (file_path, content, embedding, metadata, category)
VALUES (?, ?, ?, ?, ?)
""",
batch_data
)
@@ -765,6 +829,7 @@ class VectorStore:
top_k: int = 10,
min_score: float = 0.0,
return_full_content: bool = True,
category: Optional[str] = None,
) -> List[SearchResult]:
"""Find chunks most similar to query embedding.
@@ -776,6 +841,7 @@ class VectorStore:
top_k: Maximum results to return.
min_score: Minimum cosine similarity score in [0.0, 1.0].
return_full_content: If True, return full code block content.
category: Optional category filter ('code' or 'doc'). If None, returns all.
Returns:
List of SearchResult ordered by similarity (highest first).
@@ -796,14 +862,14 @@ class VectorStore:
):
try:
return self._search_with_ann(
query_vec, top_k, min_score, return_full_content
query_vec, top_k, min_score, return_full_content, category
)
except Exception as e:
logger.warning("ANN search failed, falling back to brute-force: %s", e)
# Fallback to brute-force search (O(N))
return self._search_brute_force(
query_vec, top_k, min_score, return_full_content
query_vec, top_k, min_score, return_full_content, category
)
def _search_with_ann(
@@ -812,6 +878,7 @@ class VectorStore:
top_k: int,
min_score: float,
return_full_content: bool,
category: Optional[str] = None,
) -> List[SearchResult]:
"""Search using HNSW index (O(log N)).
@@ -820,13 +887,16 @@ class VectorStore:
top_k: Maximum results to return
min_score: Minimum cosine similarity score in [0.0, 1.0]
return_full_content: If True, return full code block content
category: Optional category filter ('code' or 'doc')
Returns:
List of SearchResult ordered by similarity (highest first)
"""
# Limit top_k to available vectors to prevent hnswlib error
ann_count = self._ann_index.count()
effective_top_k = min(top_k, ann_count) if ann_count > 0 else 0
# When category filtering, fetch more candidates to compensate for filtering
fetch_k = top_k * 3 if category else top_k
effective_top_k = min(fetch_k, ann_count) if ann_count > 0 else 0
if effective_top_k == 0:
return []
@@ -875,8 +945,12 @@ class VectorStore:
top_ids = [f[0] for f in filtered]
top_scores = [f[1] for f in filtered]
# Fetch content from SQLite
return self._fetch_results_by_ids(top_ids, top_scores, return_full_content)
# Fetch content from SQLite with category filtering
results = self._fetch_results_by_ids(
top_ids, top_scores, return_full_content, category
)
# Apply final limit after category filtering
return results[:top_k]
def _search_brute_force(
self,
@@ -884,6 +958,7 @@ class VectorStore:
top_k: int,
min_score: float,
return_full_content: bool,
category: Optional[str] = None,
) -> List[SearchResult]:
"""Brute-force search using NumPy (O(N) fallback).
@@ -892,6 +967,7 @@ class VectorStore:
top_k: Maximum results to return
min_score: Minimum cosine similarity score in [0.0, 1.0]
return_full_content: If True, return full code block content
category: Optional category filter ('code' or 'doc')
Returns:
List of SearchResult ordered by similarity (highest first)
@@ -926,27 +1002,31 @@ class VectorStore:
if len(valid_indices) == 0:
return []
# Sort by score descending and take top_k
# When category filtering, fetch more candidates to compensate for filtering
fetch_k = top_k * 3 if category else top_k
# Sort by score descending and take top candidates
valid_scores = scores[valid_indices]
sorted_order = np.argsort(valid_scores)[::-1][:top_k]
sorted_order = np.argsort(valid_scores)[::-1][:fetch_k]
top_indices = valid_indices[sorted_order]
top_scores = valid_scores[sorted_order]
# Get chunk IDs for top results
top_ids = [self._chunk_ids[i] for i in top_indices]
# Fetch content only for top-k results (lazy loading)
# Fetch content only for top-k results (lazy loading) with category filtering
results = self._fetch_results_by_ids(
top_ids, top_scores.tolist(), return_full_content
top_ids, top_scores.tolist(), return_full_content, category
)
return results
# Apply final limit after category filtering
return results[:top_k]
def _fetch_results_by_ids(
self,
chunk_ids: List[int],
scores: List[float],
return_full_content: bool,
category: Optional[str] = None,
) -> List[SearchResult]:
"""Fetch full result data for specific chunk IDs.
@@ -954,6 +1034,7 @@ class VectorStore:
chunk_ids: List of chunk IDs to fetch.
scores: Corresponding similarity scores.
return_full_content: Whether to include full content.
category: Optional category filter ('code' or 'doc').
Returns:
List of SearchResult objects.
@@ -968,15 +1049,25 @@ class VectorStore:
# SQL injection prevention:
# - Only a validated placeholders string (commas + '?') is interpolated into the query.
# - User-provided values are passed separately via sqlite3 parameters.
query = """
SELECT id, file_path, content, metadata
FROM semantic_chunks
WHERE id IN ({placeholders})
""".format(placeholders=placeholders)
# - Category filter is added as a separate parameter
if category:
query = """
SELECT id, file_path, content, metadata
FROM semantic_chunks
WHERE id IN ({placeholders}) AND category = ?
""".format(placeholders=placeholders)
params = list(chunk_ids) + [category]
else:
query = """
SELECT id, file_path, content, metadata
FROM semantic_chunks
WHERE id IN ({placeholders})
""".format(placeholders=placeholders)
params = chunk_ids
with sqlite3.connect(self.db_path) as conn:
conn.execute("PRAGMA mmap_size = 30000000000")
rows = conn.execute(query, chunk_ids).fetchall()
rows = conn.execute(query, params).fetchall()
# Build ID -> row mapping
id_to_row = {r[0]: r for r in rows}

View File

@@ -30,8 +30,22 @@ class WatcherConfig:
"""Configuration for file watcher."""
debounce_ms: int = 1000
ignored_patterns: Set[str] = field(default_factory=lambda: {
".git", ".venv", "venv", "node_modules",
"__pycache__", ".codexlens", ".idea", ".vscode",
# Version control
".git", ".svn", ".hg",
# Python environments & cache
".venv", "venv", "env", "__pycache__", ".pytest_cache", ".mypy_cache", ".ruff_cache",
# Node.js
"node_modules", "bower_components", ".npm", ".yarn",
# Build artifacts
"dist", "build", "out", "target", "bin", "obj", "_build", "coverage", "htmlcov",
# IDE & Editor
".idea", ".vscode", ".vs", ".eclipse",
# CodexLens internal
".codexlens",
# Package manager caches
".cache", ".parcel-cache", ".turbo", ".next", ".nuxt",
# Logs & temp
"logs", "tmp", "temp",
})
languages: Optional[List[str]] = None # None = all supported