mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-02-13 02:41:50 +08:00
Enhance semantic search capabilities and configuration
- Added category support for programming and documentation languages in Config. - Implemented category-based filtering in HybridSearchEngine to improve search relevance based on query intent. - Introduced functions for filtering results by category and determining file categories based on extensions. - Updated VectorStore to include a category column in the database schema and modified chunk addition methods to support category tagging. - Enhanced the WatcherConfig to ignore additional common directories and files. - Created a benchmark script to compare performance between Binary Cascade, SPLADE, and Vector semantic search methods, including detailed result analysis and overlap comparison.
This commit is contained in:
489
codex-lens/benchmarks/compare_semantic_methods.py
Normal file
489
codex-lens/benchmarks/compare_semantic_methods.py
Normal file
@@ -0,0 +1,489 @@
|
|||||||
|
"""Compare Binary Cascade, SPLADE, and Vector semantic search methods.
|
||||||
|
|
||||||
|
This script compares the three semantic retrieval approaches:
|
||||||
|
1. Binary Cascade: 256-bit binary vectors for coarse ranking
|
||||||
|
2. SPLADE: Sparse learned representations with inverted index
|
||||||
|
3. Vector Dense: Full semantic embeddings with cosine similarity
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Add src to path
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
||||||
|
|
||||||
|
from codexlens.storage.dir_index import DirIndexStore
|
||||||
|
from codexlens.storage.splade_index import SpladeIndex
|
||||||
|
from codexlens.semantic.vector_store import VectorStore
|
||||||
|
|
||||||
|
|
||||||
|
def get_filename(path: str) -> str:
    """Return the final path component of *path*.

    Handles both Windows backslash and POSIX slash separators;
    backslash takes precedence when both appear in the string.
    """
    for separator in ("\\", "/"):
        if separator in path:
            return path.rsplit(separator, 1)[-1]
    return path
|
||||||
|
|
||||||
|
|
||||||
|
def find_splade_db(index_root: Path) -> Path:
|
||||||
|
"""Find SPLADE database by searching directory tree."""
|
||||||
|
# Check root first
|
||||||
|
if (index_root / "_splade.db").exists():
|
||||||
|
return index_root / "_splade.db"
|
||||||
|
|
||||||
|
# Search in subdirectories
|
||||||
|
for splade_db in index_root.rglob("_splade.db"):
|
||||||
|
return splade_db
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def find_binary_indexes(index_root: Path):
    """Collect every cascade binary-vector index file beneath *index_root*."""
    matches = index_root.rglob("_index_binary_vectors.bin")
    return [*matches]
|
||||||
|
|
||||||
|
|
||||||
|
# Test queries for semantic search comparison.
# A mix of conceptual ("how to ...") and concrete ("parse python source
# code") phrasings so all three retrieval styles get exercised.
TEST_QUERIES = [
    "how to search code semantically",
    "embedding generation for files",
    "hybrid search with multiple backends",
    "parse python source code",
    "database storage for vectors",
]

# Index paths
# NOTE(review): hard-coded to one developer's machine — point this at your
# own ~/.codexlens index root before running the benchmark.
INDEX_ROOT = Path(r"C:\Users\dyw\.codexlens\indexes\D\Claude_dms3\codex-lens")
|
||||||
|
|
||||||
|
|
||||||
|
def test_vector_search(query: str, limit: int = 10):
    """Test dense vector search.

    Walks every ``_index.db`` under INDEX_ROOT, runs a dense-embedding
    similarity search against the first store that has chunks, and stops
    after the first store that yields results.

    Args:
        query: Natural-language query string.
        limit: Maximum number of results to return.

    Returns:
        Tuple of ``(results, elapsed_ms, error)``; ``error`` is None on
        success and a string description on failure.
    """
    try:
        from codexlens.semantic.factory import get_embedder

        # Find an index with embeddings
        all_results = []
        total_time = 0

        for index_db in INDEX_ROOT.rglob("_index.db"):
            vector_store = VectorStore(index_db)

            # Skip stores with no semantic chunks at all.
            if vector_store.count_chunks() == 0:
                continue

            # Get embedder based on stored config, so the query is encoded
            # with the same model that produced the stored embeddings.
            model_config = vector_store.get_model_config()
            if model_config:
                backend = model_config.get("backend", "fastembed")
                model_name = model_config["model_name"]
                model_profile = model_config["model_profile"]
                if backend == "litellm":
                    embedder = get_embedder(backend="litellm", model=model_name)
                else:
                    embedder = get_embedder(backend="fastembed", profile=model_profile)
            else:
                # No stored config - fall back to the default code profile.
                embedder = get_embedder(backend="fastembed", profile="code")

            # Timed section covers query embedding + similarity search.
            start = time.perf_counter()
            query_embedding = embedder.embed_single(query)
            results = vector_store.search_similar(
                query_embedding=query_embedding,
                top_k=limit,
                min_score=0.0,
                return_full_content=True,
            )
            total_time += (time.perf_counter() - start) * 1000
            all_results.extend(results)

            # Only need one successful search to get embedder initialized
            if results:
                break

        # Sort by score and limit
        all_results.sort(key=lambda x: x.score, reverse=True)
        return all_results[:limit], total_time, None
    except Exception as e:
        # Broad catch is intentional: this is a benchmark probe and any
        # failure is reported as the error string, not raised.
        return [], 0, str(e)
|
||||||
|
|
||||||
|
|
||||||
|
def test_splade_search(query: str, limit: int = 10):
    """Test SPLADE sparse search.

    Encodes the query into a sparse learned representation and searches
    the SPLADE inverted index found under INDEX_ROOT.

    Args:
        query: Natural-language query string.
        limit: Maximum number of results to return.

    Returns:
        Tuple of ``(results, elapsed_ms, error)``; ``error`` is None on
        success and a string description on failure.
    """
    try:
        from codexlens.semantic.splade_encoder import get_splade_encoder, check_splade_available

        ok, err = check_splade_available()
        if not ok:
            return [], 0, f"SPLADE not available: {err}"

        splade_db_path = find_splade_db(INDEX_ROOT)
        if not splade_db_path:
            return [], 0, "SPLADE database not found"

        splade_index = SpladeIndex(splade_db_path)
        if not splade_index.has_index():
            return [], 0, "SPLADE index not initialized"

        # Timed section includes encoder initialization, query encoding,
        # the index search, and chunk hydration below.
        start = time.perf_counter()
        encoder = get_splade_encoder()
        query_sparse = encoder.encode_text(query)
        raw_results = splade_index.search(query_sparse, limit=limit, min_score=0.0)

        if not raw_results:
            elapsed = (time.perf_counter() - start) * 1000
            return [], elapsed, None

        # Get chunk details: raw_results is a list of (chunk_id, score).
        chunk_ids = [chunk_id for chunk_id, _ in raw_results]
        score_map = {chunk_id: score for chunk_id, score in raw_results}
        rows = splade_index.get_chunks_by_ids(chunk_ids)

        elapsed = (time.perf_counter() - start) * 1000

        # Build result objects (content truncated to 200 chars for display).
        results = []
        for row in rows:
            chunk_id = row["id"]
            results.append({
                "path": row["file_path"],
                "score": score_map.get(chunk_id, 0.0),
                "content": row["content"][:200] + "..." if len(row["content"]) > 200 else row["content"],
            })

        # Sort by score (get_chunks_by_ids does not guarantee score order).
        results.sort(key=lambda x: x["score"], reverse=True)
        return results, elapsed, None
    except Exception as e:
        # Benchmark probe: report any failure as the error string.
        return [], 0, str(e)
|
||||||
|
|
||||||
|
|
||||||
|
def test_binary_cascade_search(query: str, limit: int = 10):
    """Test binary cascade search (binary coarse + dense fine ranking).

    Two-stage retrieval: a 256-bit binary ANN index produces coarse
    candidates, which are then reranked by cosine similarity against the
    dense embeddings stored in each ``_index.db``.

    Args:
        query: Natural-language query string.
        limit: Maximum number of results to return.

    Returns:
        Tuple of ``(results, elapsed_ms, error)``; ``error`` is None on
        success and a string description on failure.
    """
    try:
        from codexlens.semantic.ann_index import BinaryANNIndex
        from codexlens.indexing.embedding import CascadeEmbeddingBackend
        import numpy as np
        import sqlite3

        # Find binary indexes
        binary_indexes = find_binary_indexes(INDEX_ROOT)
        if not binary_indexes:
            return [], 0, "No binary indexes found. Run 'codexlens cascade-index' first."

        # Timer covers backend init + query encoding + all index scans.
        start = time.perf_counter()

        # Initialize cascade backend for query encoding
        cascade_backend = CascadeEmbeddingBackend()

        # Encode query to binary and dense representations in one pass.
        binary_embeddings, dense_embeddings = cascade_backend.encode_cascade([query], batch_size=1)
        query_binary = binary_embeddings[0]
        query_dense = dense_embeddings[0]

        all_results = []

        for binary_index_path in binary_indexes:
            # Find corresponding index.db (sibling of the binary file).
            index_db = binary_index_path.parent / "_index.db"
            if not index_db.exists():
                continue

            # Check if cascade embeddings exist for this index.
            conn = sqlite3.connect(index_db)
            conn.row_factory = sqlite3.Row
            try:
                cursor = conn.execute(
                    "SELECT COUNT(*) FROM semantic_chunks WHERE embedding_binary IS NOT NULL"
                )
                binary_count = cursor.fetchone()[0]
                if binary_count == 0:
                    conn.close()
                    continue
            except Exception:
                # Any query failure (e.g. missing table) skips this index.
                conn.close()
                continue

            # Stage 1: Binary coarse search
            binary_index = BinaryANNIndex(index_db, dim=256)
            try:
                binary_index.load()
            except Exception:
                conn.close()
                continue

            # Pack query bits into the byte layout the binary index expects.
            from codexlens.indexing.embedding import pack_binary_embedding
            query_binary_packed = pack_binary_embedding(query_binary)

            # Get top candidates: over-fetch (10x limit, capped at 100)
            # so the dense rerank has enough candidates to reorder.
            coarse_limit = min(limit * 10, 100)
            # search returns (ids, distances) tuple
            coarse_ids, coarse_distances = binary_index.search(query_binary_packed, top_k=coarse_limit)

            if not coarse_ids:
                conn.close()
                continue

            # Stage 2: Dense reranking
            chunk_ids = coarse_ids
            placeholders = ",".join("?" * len(chunk_ids))

            # f-string only interpolates the "?" placeholders; the ids
            # themselves are bound as parameters (no SQL injection risk).
            cursor = conn.execute(
                f"""
                SELECT id, file_path, content, embedding_dense
                FROM semantic_chunks
                WHERE id IN ({placeholders}) AND embedding_dense IS NOT NULL
                """,
                chunk_ids
            )
            rows = cursor.fetchall()

            # Compute dense scores for each surviving candidate.
            for row in rows:
                chunk_id = row["id"]
                file_path = row["file_path"]
                content = row["content"]
                dense_blob = row["embedding_dense"]

                if dense_blob:
                    # Stored embeddings are raw float32 blobs.
                    dense_vec = np.frombuffer(dense_blob, dtype=np.float32)
                    # Cosine similarity (epsilon guards zero-norm vectors).
                    score = float(np.dot(query_dense, dense_vec) / (
                        np.linalg.norm(query_dense) * np.linalg.norm(dense_vec) + 1e-8
                    ))
                else:
                    score = 0.0

                all_results.append({
                    "path": file_path,
                    "score": score,
                    "content": content[:200] + "..." if len(content) > 200 else content,
                })

            conn.close()

        # Sort by dense score and limit
        all_results.sort(key=lambda x: x["score"], reverse=True)
        final_results = all_results[:limit]

        elapsed = (time.perf_counter() - start) * 1000

        return final_results, elapsed, None
    except ImportError as e:
        return [], 0, f"Import error: {e}"
    except Exception as e:
        # Include a traceback: cascade failures are the hardest to diagnose.
        import traceback
        return [], 0, f"{str(e)}\n{traceback.format_exc()}"
|
||||||
|
|
||||||
|
|
||||||
|
def print_results(method_name: str, results, elapsed: float, error: str | None = None):
    """Print search results in a formatted way.

    Args:
        method_name: Human-readable label for the search method.
        results: List of result dicts (SPLADE/binary) or result objects
            (vector store) exposing path/score/content.
        elapsed: Query latency in milliseconds.
        error: Optional error description; when set, results are not shown.
    """
    print(f"\n{'='*60}")
    print(f"Method: {method_name}")
    print(f"{'='*60}")

    if error:
        print(f"ERROR: {error}")
        return

    print(f"Results: {len(results)}, Time: {elapsed:.1f}ms")
    print("-" * 60)

    for i, r in enumerate(results[:5], 1):
        # Results may be plain dicts or attribute-style objects.
        if isinstance(r, dict):
            path = r.get("path", "?")
            score = r.get("score", 0)
            content = r.get("content", "")[:80]
        else:
            path = getattr(r, "path", "?")
            score = getattr(r, "score", 0)
            content = getattr(r, "content", "")[:80] if hasattr(r, "content") else ""

        filename = get_filename(path)
        # Fix: previously printed the literal "(unknown)" instead of the
        # computed filename, leaving `filename` unused.
        print(f" {i}. [{score:.4f}] {filename}")
        if content:
            # Sanitize content for console output
            safe_content = content.encode('ascii', 'replace').decode('ascii')
            print(f" {safe_content}...")
|
||||||
|
|
||||||
|
|
||||||
|
def compare_overlap(results1, results2, name1: str, name2: str):
    """Report how many top-10 files two result lists share.

    Prints the common-file count and returns the Jaccard index of the
    two top-10 path sets (0.0 when either list is empty).
    """

    def top_paths(results):
        collected = set()
        for item in results[:10]:
            if isinstance(item, dict):
                collected.add(item.get("path", ""))
            else:
                collected.add(getattr(item, "path", ""))
        return collected

    first = top_paths(results1)
    second = top_paths(results2)

    if not first or not second:
        return 0.0

    overlap = len(first & second)
    union = len(first | second)
    jaccard = overlap / union if union else 0.0

    print(f" {name1} vs {name2}: {overlap} common files (Jaccard: {jaccard:.2f})")
    return jaccard
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Run all three retrieval methods over TEST_QUERIES and report
    per-method statistics, pairwise result overlap, and a final analysis."""
    print("=" * 70)
    print("SEMANTIC SEARCH METHODS COMPARISON")
    print("Binary Cascade vs SPLADE vs Vector Dense")
    print("=" * 70)

    # Check prerequisites
    print("\n[Prerequisites Check]")
    print(f" Index Root: {INDEX_ROOT}")

    splade_db = find_splade_db(INDEX_ROOT)
    print(f" SPLADE DB: {splade_db} - {'EXISTS' if splade_db else 'NOT FOUND'}")

    binary_indexes = find_binary_indexes(INDEX_ROOT)
    print(f" Binary Indexes: {len(binary_indexes)} found")
    for bi in binary_indexes[:3]:
        print(f" - {bi.parent.name}/{bi.name}")
    if len(binary_indexes) > 3:
        print(f" ... and {len(binary_indexes) - 3} more")

    # Aggregate statistics, keyed by method name.
    all_results = {
        "binary": {"total_results": 0, "total_time": 0, "queries": 0, "errors": []},
        "splade": {"total_results": 0, "total_time": 0, "queries": 0, "errors": []},
        "vector": {"total_results": 0, "total_time": 0, "queries": 0, "errors": []},
    }

    # Per-pair Jaccard scores collected across queries.
    overlap_scores = {"binary_splade": [], "binary_vector": [], "splade_vector": []}

    for query in TEST_QUERIES:
        print(f"\n{'#'*70}")
        print(f"QUERY: \"{query}\"")
        print("#" * 70)

        # Test each method; each returns (results, elapsed_ms, error).
        binary_results, binary_time, binary_err = test_binary_cascade_search(query)
        splade_results, splade_time, splade_err = test_splade_search(query)
        vector_results, vector_time, vector_err = test_vector_search(query)

        # Print results
        print_results("Binary Cascade (256-bit + Dense Rerank)", binary_results, binary_time, binary_err)
        print_results("SPLADE (Sparse Learned)", splade_results, splade_time, splade_err)
        print_results("Vector Dense (Semantic Embeddings)", vector_results, vector_time, vector_err)

        # Update statistics: only error-free queries count toward averages.
        if not binary_err:
            all_results["binary"]["total_results"] += len(binary_results)
            all_results["binary"]["total_time"] += binary_time
            all_results["binary"]["queries"] += 1
        else:
            all_results["binary"]["errors"].append(binary_err)

        if not splade_err:
            all_results["splade"]["total_results"] += len(splade_results)
            all_results["splade"]["total_time"] += splade_time
            all_results["splade"]["queries"] += 1
        else:
            all_results["splade"]["errors"].append(splade_err)

        if not vector_err:
            all_results["vector"]["total_results"] += len(vector_results)
            all_results["vector"]["total_time"] += vector_time
            all_results["vector"]["queries"] += 1
        else:
            all_results["vector"]["errors"].append(vector_err)

        # Compare overlap for each pair that produced results.
        print("\n[Result Overlap Analysis]")
        if binary_results and splade_results:
            j = compare_overlap(binary_results, splade_results, "Binary", "SPLADE")
            overlap_scores["binary_splade"].append(j)
        if binary_results and vector_results:
            j = compare_overlap(binary_results, vector_results, "Binary", "Vector")
            overlap_scores["binary_vector"].append(j)
        if splade_results and vector_results:
            j = compare_overlap(splade_results, vector_results, "SPLADE", "Vector")
            overlap_scores["splade_vector"].append(j)

    # Print summary
    print("\n" + "=" * 70)
    print("SUMMARY STATISTICS")
    print("=" * 70)

    for method, stats in all_results.items():
        queries = stats["queries"]
        if queries > 0:
            avg_results = stats["total_results"] / queries
            avg_time = stats["total_time"] / queries
            print(f"\n{method.upper()}:")
            print(f" Successful queries: {queries}/{len(TEST_QUERIES)}")
            print(f" Avg results: {avg_results:.1f}")
            print(f" Avg time: {avg_time:.1f}ms")
        else:
            print(f"\n{method.upper()}: No successful queries")
            if stats["errors"]:
                # Show truncated error (first one only).
                err = stats["errors"][0]
                if len(err) > 200:
                    err = err[:200] + "..."
                print(f" Error: {err}")

    print("\n[Average Overlap Scores]")
    for pair, scores in overlap_scores.items():
        if scores:
            avg = sum(scores) / len(scores)
            print(f" {pair}: {avg:.3f}")

    print("\n" + "=" * 70)
    print("ANALYSIS")
    print("=" * 70)

    # Analyze working methods
    working_methods = [m for m, s in all_results.items() if s["queries"] > 0]

    if len(working_methods) == 3:
        # All methods working - compare quality
        print("\nAll three methods working. Quality comparison:")

        # Compare avg results
        print("\n Result Coverage (higher = more recall):")
        for m in ["vector", "splade", "binary"]:
            stats = all_results[m]
            if stats["queries"] > 0:
                avg = stats["total_results"] / stats["queries"]
                print(f" {m.upper()}: {avg:.1f} results/query")

        # Compare speed
        print("\n Speed (lower = faster):")
        for m in ["binary", "splade", "vector"]:
            stats = all_results[m]
            if stats["queries"] > 0:
                avg = stats["total_time"] / stats["queries"]
                print(f" {m.upper()}: {avg:.1f}ms")

        # Recommend fusion strategy
        print("\n Recommended Fusion Strategy:")
        print(" For quality-focused hybrid search:")
        print(" 1. Run all three in parallel")
        print(" 2. Use RRF fusion with weights:")
        print(" - Vector: 0.4 (best semantic understanding)")
        print(" - SPLADE: 0.35 (learned sparse representations)")
        print(" - Binary: 0.25 (fast coarse filtering)")
        print(" 3. Apply CrossEncoder reranking on top-50")

    elif len(working_methods) >= 2:
        print(f"\n{len(working_methods)} methods working: {', '.join(working_methods)}")
        print("Consider fixing missing method for complete hybrid search.")
    else:
        print(f"\nOnly {working_methods[0] if working_methods else 'no'} method(s) working.")
        print("Check your index setup.")
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: run the full three-way comparison.
if __name__ == "__main__":
    main()
|
||||||
@@ -17,6 +17,20 @@ except ImportError:
|
|||||||
def is_embedding_backend_available(_backend: str): # type: ignore[no-redef]
|
def is_embedding_backend_available(_backend: str): # type: ignore[no-redef]
|
||||||
return False, "codexlens.semantic not available"
|
return False, "codexlens.semantic not available"
|
||||||
|
|
||||||
|
# Prefer the canonical category mapping from the search ranking module;
# fall back to a local extension table when it cannot be imported.
try:
    from codexlens.search.ranking import get_file_category
except ImportError:
    def get_file_category(path: str):  # type: ignore[no-redef]
        """Fallback: map common extensions to category.

        Returns "code", "doc", or None for unrecognized extensions.
        NOTE(review): this table may be narrower than the canonical
        mapping in codexlens.search.ranking — verify they stay in sync.
        """
        ext = Path(path).suffix.lower()
        code_exts = {".py", ".js", ".jsx", ".ts", ".tsx", ".java", ".go", ".c", ".cpp", ".rs"}
        doc_exts = {".md", ".mdx", ".txt", ".rst"}
        if ext in code_exts:
            return "code"
        elif ext in doc_exts:
            return "doc"
        return None
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
# Embedding batch size - larger values improve throughput on modern hardware
|
# Embedding batch size - larger values improve throughput on modern hardware
|
||||||
@@ -24,6 +38,22 @@ logger = logging.getLogger(__name__)
|
|||||||
EMBEDDING_BATCH_SIZE = 256
|
EMBEDDING_BATCH_SIZE = 256
|
||||||
|
|
||||||
|
|
||||||
|
def _build_categories_from_batch(chunk_batch: List[Tuple[Any, str]]) -> List[str]:
    """Build categories list from chunk batch for index-level category filtering.

    Args:
        chunk_batch: List of (chunk, file_path) tuples

    Returns:
        List of category strings ('code' or 'doc'), defaulting to 'code' for unknown
    """
    # Unknown extensions fall back to 'code' so every chunk has a category.
    return [get_file_category(file_path) or "code" for _, file_path in chunk_batch]
|
||||||
|
|
||||||
|
|
||||||
def _cleanup_fastembed_resources() -> None:
|
def _cleanup_fastembed_resources() -> None:
|
||||||
"""Best-effort cleanup for fastembed/ONNX resources (no-op for other backends)."""
|
"""Best-effort cleanup for fastembed/ONNX resources (no-op for other backends)."""
|
||||||
try:
|
try:
|
||||||
@@ -577,8 +607,9 @@ def generate_embeddings(
|
|||||||
batch_contents = [chunk.content for chunk, _ in chunk_batch]
|
batch_contents = [chunk.content for chunk, _ in chunk_batch]
|
||||||
embeddings_numpy = embedder.embed_to_numpy(batch_contents, batch_size=EMBEDDING_BATCH_SIZE)
|
embeddings_numpy = embedder.embed_to_numpy(batch_contents, batch_size=EMBEDDING_BATCH_SIZE)
|
||||||
|
|
||||||
# Store embeddings
|
# Store embeddings with category
|
||||||
vector_store.add_chunks_batch_numpy(chunk_batch, embeddings_numpy)
|
categories = _build_categories_from_batch(chunk_batch)
|
||||||
|
vector_store.add_chunks_batch_numpy(chunk_batch, embeddings_numpy, categories=categories)
|
||||||
|
|
||||||
files_seen.update(batch_files)
|
files_seen.update(batch_files)
|
||||||
total_chunks_created += len(chunk_batch)
|
total_chunks_created += len(chunk_batch)
|
||||||
@@ -630,7 +661,8 @@ def generate_embeddings(
|
|||||||
batch_num, chunk_batch, embeddings_numpy, batch_files, error = f.result()
|
batch_num, chunk_batch, embeddings_numpy, batch_files, error = f.result()
|
||||||
if embeddings_numpy is not None and error is None:
|
if embeddings_numpy is not None and error is None:
|
||||||
# Write to DB in main thread (no contention)
|
# Write to DB in main thread (no contention)
|
||||||
vector_store.add_chunks_batch_numpy(chunk_batch, embeddings_numpy)
|
categories = _build_categories_from_batch(chunk_batch)
|
||||||
|
vector_store.add_chunks_batch_numpy(chunk_batch, embeddings_numpy, categories=categories)
|
||||||
total_chunks_created += len(chunk_batch)
|
total_chunks_created += len(chunk_batch)
|
||||||
files_seen.update(batch_files)
|
files_seen.update(batch_files)
|
||||||
total_files_processed = len(files_seen)
|
total_files_processed = len(files_seen)
|
||||||
@@ -667,7 +699,8 @@ def generate_embeddings(
|
|||||||
try:
|
try:
|
||||||
batch_num, chunk_batch, embeddings_numpy, batch_files, error = future.result()
|
batch_num, chunk_batch, embeddings_numpy, batch_files, error = future.result()
|
||||||
if embeddings_numpy is not None and error is None:
|
if embeddings_numpy is not None and error is None:
|
||||||
vector_store.add_chunks_batch_numpy(chunk_batch, embeddings_numpy)
|
categories = _build_categories_from_batch(chunk_batch)
|
||||||
|
vector_store.add_chunks_batch_numpy(chunk_batch, embeddings_numpy, categories=categories)
|
||||||
total_chunks_created += len(chunk_batch)
|
total_chunks_created += len(chunk_batch)
|
||||||
files_seen.update(batch_files)
|
files_seen.update(batch_files)
|
||||||
total_files_processed = len(files_seen)
|
total_files_processed = len(files_seen)
|
||||||
|
|||||||
@@ -67,15 +67,21 @@ class Config:
|
|||||||
venv_path: Path = field(default_factory=lambda: _default_global_dir() / "venv")
|
venv_path: Path = field(default_factory=lambda: _default_global_dir() / "venv")
|
||||||
supported_languages: Dict[str, Dict[str, Any]] = field(
|
supported_languages: Dict[str, Dict[str, Any]] = field(
|
||||||
default_factory=lambda: {
|
default_factory=lambda: {
|
||||||
"python": {"extensions": [".py"], "tree_sitter_language": "python"},
|
# Source code languages (category: "code")
|
||||||
"javascript": {"extensions": [".js", ".jsx"], "tree_sitter_language": "javascript"},
|
"python": {"extensions": [".py"], "tree_sitter_language": "python", "category": "code"},
|
||||||
"typescript": {"extensions": [".ts", ".tsx"], "tree_sitter_language": "typescript"},
|
"javascript": {"extensions": [".js", ".jsx"], "tree_sitter_language": "javascript", "category": "code"},
|
||||||
"java": {"extensions": [".java"], "tree_sitter_language": "java"},
|
"typescript": {"extensions": [".ts", ".tsx"], "tree_sitter_language": "typescript", "category": "code"},
|
||||||
"go": {"extensions": [".go"], "tree_sitter_language": "go"},
|
"java": {"extensions": [".java"], "tree_sitter_language": "java", "category": "code"},
|
||||||
"zig": {"extensions": [".zig"], "tree_sitter_language": "zig"},
|
"go": {"extensions": [".go"], "tree_sitter_language": "go", "category": "code"},
|
||||||
"objective-c": {"extensions": [".m", ".mm"], "tree_sitter_language": "objc"},
|
"zig": {"extensions": [".zig"], "tree_sitter_language": "zig", "category": "code"},
|
||||||
"markdown": {"extensions": [".md", ".mdx"], "tree_sitter_language": None},
|
"objective-c": {"extensions": [".m", ".mm"], "tree_sitter_language": "objc", "category": "code"},
|
||||||
"text": {"extensions": [".txt"], "tree_sitter_language": None},
|
"c": {"extensions": [".c", ".h"], "tree_sitter_language": "c", "category": "code"},
|
||||||
|
"cpp": {"extensions": [".cc", ".cpp", ".hpp", ".cxx"], "tree_sitter_language": "cpp", "category": "code"},
|
||||||
|
"rust": {"extensions": [".rs"], "tree_sitter_language": "rust", "category": "code"},
|
||||||
|
# Documentation languages (category: "doc")
|
||||||
|
"markdown": {"extensions": [".md", ".mdx"], "tree_sitter_language": None, "category": "doc"},
|
||||||
|
"text": {"extensions": [".txt"], "tree_sitter_language": None, "category": "doc"},
|
||||||
|
"rst": {"extensions": [".rst"], "tree_sitter_language": None, "category": "doc"},
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
parsing_rules: Dict[str, Dict[str, Any]] = field(
|
parsing_rules: Dict[str, Dict[str, Any]] = field(
|
||||||
@@ -141,6 +147,9 @@ class Config:
|
|||||||
fusion_method: str = "rrf" # "simple" (weighted sum) or "rrf" (reciprocal rank fusion)
|
fusion_method: str = "rrf" # "simple" (weighted sum) or "rrf" (reciprocal rank fusion)
|
||||||
rrf_k: int = 60 # RRF constant (default 60)
|
rrf_k: int = 60 # RRF constant (default 60)
|
||||||
|
|
||||||
|
# Category-based filtering to separate code/doc results
|
||||||
|
enable_category_filter: bool = True # Enable code/doc result separation
|
||||||
|
|
||||||
# Multi-endpoint configuration for litellm backend
|
# Multi-endpoint configuration for litellm backend
|
||||||
embedding_endpoints: List[Dict[str, Any]] = field(default_factory=list)
|
embedding_endpoints: List[Dict[str, Any]] = field(default_factory=list)
|
||||||
# List of endpoint configs: [{"model": "...", "api_key": "...", "api_base": "...", "weight": 1.0}]
|
# List of endpoint configs: [{"model": "...", "api_key": "...", "api_base": "...", "weight": 1.0}]
|
||||||
@@ -210,6 +219,14 @@ class Config:
|
|||||||
return language_id
|
return language_id
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
def category_for_path(self, path: str | Path) -> str | None:
    """Get file category ('code' or 'doc') from a file path."""
    language = self.language_for_path(path)
    if language is None:
        return None
    # Unknown languages (or ones without a category key) yield None.
    lang_spec = self.supported_languages.get(language, {})
    return lang_spec.get("category")
|
||||||
|
|
||||||
def rules_for_language(self, language_id: str) -> Dict[str, Any]:
|
def rules_for_language(self, language_id: str) -> Dict[str, Any]:
|
||||||
"""Get parsing rules for a specific language, falling back to defaults."""
|
"""Get parsing rules for a specific language, falling back to defaults."""
|
||||||
return {**self.parsing_rules.get("default", {}), **self.parsing_rules.get(language_id, {})}
|
return {**self.parsing_rules.get("default", {}), **self.parsing_rules.get(language_id, {})}
|
||||||
|
|||||||
@@ -35,8 +35,11 @@ from codexlens.entities import SearchResult
|
|||||||
from codexlens.search.ranking import (
|
from codexlens.search.ranking import (
|
||||||
DEFAULT_WEIGHTS,
|
DEFAULT_WEIGHTS,
|
||||||
FTS_FALLBACK_WEIGHTS,
|
FTS_FALLBACK_WEIGHTS,
|
||||||
|
QueryIntent,
|
||||||
apply_symbol_boost,
|
apply_symbol_boost,
|
||||||
cross_encoder_rerank,
|
cross_encoder_rerank,
|
||||||
|
detect_query_intent,
|
||||||
|
filter_results_by_category,
|
||||||
get_rrf_weights,
|
get_rrf_weights,
|
||||||
reciprocal_rank_fusion,
|
reciprocal_rank_fusion,
|
||||||
rerank_results,
|
rerank_results,
|
||||||
@@ -131,6 +134,16 @@ class HybridSearchEngine:
|
|||||||
except OSError:
|
except OSError:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
# Detect query intent early for category filtering at index level
|
||||||
|
query_intent = detect_query_intent(query)
|
||||||
|
# Map intent to category for vector search:
|
||||||
|
# - KEYWORD (code intent) -> filter to 'code' only
|
||||||
|
# - SEMANTIC (doc intent) -> no filter (allow docs to surface)
|
||||||
|
# - MIXED -> no filter (allow all)
|
||||||
|
vector_category: Optional[str] = None
|
||||||
|
if query_intent == QueryIntent.KEYWORD:
|
||||||
|
vector_category = "code"
|
||||||
|
|
||||||
# Determine which backends to use
|
# Determine which backends to use
|
||||||
backends = {}
|
backends = {}
|
||||||
|
|
||||||
@@ -183,7 +196,7 @@ class HybridSearchEngine:
|
|||||||
|
|
||||||
# Execute parallel searches
|
# Execute parallel searches
|
||||||
with timer("parallel_search_total", self.logger):
|
with timer("parallel_search_total", self.logger):
|
||||||
results_map = self._search_parallel(index_path, query, backends, limit)
|
results_map = self._search_parallel(index_path, query, backends, limit, vector_category)
|
||||||
|
|
||||||
# Provide helpful message if pure-vector mode returns no results
|
# Provide helpful message if pure-vector mode returns no results
|
||||||
if pure_vector and enable_vector and len(results_map.get("vector", [])) == 0:
|
if pure_vector and enable_vector and len(results_map.get("vector", [])) == 0:
|
||||||
@@ -263,6 +276,19 @@ class HybridSearchEngine:
|
|||||||
top_k=self._config.reranker_top_k,
|
top_k=self._config.reranker_top_k,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Apply category filtering to avoid code/doc pollution
|
||||||
|
# This ensures KEYWORD queries return code files, SEMANTIC queries prefer docs
|
||||||
|
enable_category_filter = (
|
||||||
|
self._config is None
|
||||||
|
or getattr(self._config, 'enable_category_filter', True)
|
||||||
|
)
|
||||||
|
if enable_category_filter and not pure_vector:
|
||||||
|
with timer("category_filter", self.logger):
|
||||||
|
query_intent = detect_query_intent(query)
|
||||||
|
fused_results = filter_results_by_category(
|
||||||
|
fused_results, query_intent, allow_mixed=True
|
||||||
|
)
|
||||||
|
|
||||||
# Apply final limit
|
# Apply final limit
|
||||||
return fused_results[:limit]
|
return fused_results[:limit]
|
||||||
|
|
||||||
@@ -361,6 +387,7 @@ class HybridSearchEngine:
|
|||||||
query: str,
|
query: str,
|
||||||
backends: Dict[str, bool],
|
backends: Dict[str, bool],
|
||||||
limit: int,
|
limit: int,
|
||||||
|
category: Optional[str] = None,
|
||||||
) -> Dict[str, List[SearchResult]]:
|
) -> Dict[str, List[SearchResult]]:
|
||||||
"""Execute parallel searches across enabled backends.
|
"""Execute parallel searches across enabled backends.
|
||||||
|
|
||||||
@@ -369,6 +396,7 @@ class HybridSearchEngine:
|
|||||||
query: FTS5 query string
|
query: FTS5 query string
|
||||||
backends: Dictionary of backend name to enabled flag
|
backends: Dictionary of backend name to enabled flag
|
||||||
limit: Results limit per backend
|
limit: Results limit per backend
|
||||||
|
category: Optional category filter for vector search ('code' or 'doc')
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Dictionary mapping source name to results list
|
Dictionary mapping source name to results list
|
||||||
@@ -399,7 +427,7 @@ class HybridSearchEngine:
|
|||||||
if backends.get("vector"):
|
if backends.get("vector"):
|
||||||
submit_times["vector"] = time.perf_counter()
|
submit_times["vector"] = time.perf_counter()
|
||||||
future = executor.submit(
|
future = executor.submit(
|
||||||
self._search_vector, index_path, query, limit
|
self._search_vector, index_path, query, limit, category
|
||||||
)
|
)
|
||||||
future_to_source[future] = "vector"
|
future_to_source[future] = "vector"
|
||||||
|
|
||||||
@@ -490,7 +518,7 @@ class HybridSearchEngine:
|
|||||||
return []
|
return []
|
||||||
|
|
||||||
def _search_vector(
|
def _search_vector(
|
||||||
self, index_path: Path, query: str, limit: int
|
self, index_path: Path, query: str, limit: int, category: Optional[str] = None
|
||||||
) -> List[SearchResult]:
|
) -> List[SearchResult]:
|
||||||
"""Execute vector similarity search using semantic embeddings.
|
"""Execute vector similarity search using semantic embeddings.
|
||||||
|
|
||||||
@@ -498,6 +526,7 @@ class HybridSearchEngine:
|
|||||||
index_path: Path to _index.db file
|
index_path: Path to _index.db file
|
||||||
query: Natural language query string
|
query: Natural language query string
|
||||||
limit: Maximum results
|
limit: Maximum results
|
||||||
|
category: Optional category filter ('code' or 'doc')
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List of SearchResult objects ordered by semantic similarity
|
List of SearchResult objects ordered by semantic similarity
|
||||||
@@ -616,6 +645,7 @@ class HybridSearchEngine:
|
|||||||
top_k=limit,
|
top_k=limit,
|
||||||
min_score=0.0, # Return all results, let RRF handle filtering
|
min_score=0.0, # Return all results, let RRF handle filtering
|
||||||
return_full_content=True,
|
return_full_content=True,
|
||||||
|
category=category,
|
||||||
)
|
)
|
||||||
self.logger.debug(
|
self.logger.debug(
|
||||||
"[TIMING] vector_similarity_search: %.2fms (%d results)",
|
"[TIMING] vector_similarity_search: %.2fms (%d results)",
|
||||||
|
|||||||
@@ -9,7 +9,8 @@ from __future__ import annotations
|
|||||||
import re
|
import re
|
||||||
import math
|
import math
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from typing import Any, Dict, List
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
from codexlens.entities import SearchResult, AdditionalLocation
|
from codexlens.entities import SearchResult, AdditionalLocation
|
||||||
|
|
||||||
@@ -132,6 +133,83 @@ def get_rrf_weights(
|
|||||||
return adjust_weights_by_intent(detect_query_intent(query), base_weights)
|
return adjust_weights_by_intent(detect_query_intent(query), base_weights)
|
||||||
|
|
||||||
|
|
||||||
|
# File extensions to category mapping for fast lookup
|
||||||
|
_EXT_TO_CATEGORY: Dict[str, str] = {
|
||||||
|
# Code extensions
|
||||||
|
".py": "code", ".js": "code", ".jsx": "code", ".ts": "code", ".tsx": "code",
|
||||||
|
".java": "code", ".go": "code", ".zig": "code", ".m": "code", ".mm": "code",
|
||||||
|
".c": "code", ".h": "code", ".cc": "code", ".cpp": "code", ".hpp": "code", ".cxx": "code",
|
||||||
|
".rs": "code",
|
||||||
|
# Doc extensions
|
||||||
|
".md": "doc", ".mdx": "doc", ".txt": "doc", ".rst": "doc",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def get_file_category(path: str) -> Optional[str]:
|
||||||
|
"""Get file category ('code' or 'doc') from path extension.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
path: File path string
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
'code', 'doc', or None if unknown
|
||||||
|
"""
|
||||||
|
ext = Path(path).suffix.lower()
|
||||||
|
return _EXT_TO_CATEGORY.get(ext)
|
||||||
|
|
||||||
|
|
||||||
|
def filter_results_by_category(
|
||||||
|
results: List[SearchResult],
|
||||||
|
intent: QueryIntent,
|
||||||
|
allow_mixed: bool = True,
|
||||||
|
) -> List[SearchResult]:
|
||||||
|
"""Filter results by category based on query intent.
|
||||||
|
|
||||||
|
Strategy:
|
||||||
|
- KEYWORD (code intent): Only return code files
|
||||||
|
- SEMANTIC (doc intent): Prefer docs, but allow code if allow_mixed=True
|
||||||
|
- MIXED: Return all results
|
||||||
|
|
||||||
|
Args:
|
||||||
|
results: List of SearchResult objects
|
||||||
|
intent: Query intent from detect_query_intent()
|
||||||
|
allow_mixed: If True, SEMANTIC intent includes code files with lower priority
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Filtered and re-ranked list of SearchResult objects
|
||||||
|
"""
|
||||||
|
if not results or intent == QueryIntent.MIXED:
|
||||||
|
return results
|
||||||
|
|
||||||
|
code_results = []
|
||||||
|
doc_results = []
|
||||||
|
unknown_results = []
|
||||||
|
|
||||||
|
for r in results:
|
||||||
|
category = get_file_category(r.path)
|
||||||
|
if category == "code":
|
||||||
|
code_results.append(r)
|
||||||
|
elif category == "doc":
|
||||||
|
doc_results.append(r)
|
||||||
|
else:
|
||||||
|
unknown_results.append(r)
|
||||||
|
|
||||||
|
if intent == QueryIntent.KEYWORD:
|
||||||
|
# Code intent: return only code files + unknown (might be code)
|
||||||
|
filtered = code_results + unknown_results
|
||||||
|
elif intent == QueryIntent.SEMANTIC:
|
||||||
|
if allow_mixed:
|
||||||
|
# Semantic intent with mixed: docs first, then code
|
||||||
|
filtered = doc_results + code_results + unknown_results
|
||||||
|
else:
|
||||||
|
# Semantic intent strict: only docs
|
||||||
|
filtered = doc_results + unknown_results
|
||||||
|
else:
|
||||||
|
filtered = results
|
||||||
|
|
||||||
|
return filtered
|
||||||
|
|
||||||
|
|
||||||
def simple_weighted_fusion(
|
def simple_weighted_fusion(
|
||||||
results_map: Dict[str, List[SearchResult]],
|
results_map: Dict[str, List[SearchResult]],
|
||||||
weights: Dict[str, float] = None,
|
weights: Dict[str, float] = None,
|
||||||
|
|||||||
@@ -155,6 +155,7 @@ class VectorStore:
|
|||||||
content TEXT NOT NULL,
|
content TEXT NOT NULL,
|
||||||
embedding BLOB NOT NULL,
|
embedding BLOB NOT NULL,
|
||||||
metadata TEXT,
|
metadata TEXT,
|
||||||
|
category TEXT DEFAULT 'code',
|
||||||
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
||||||
)
|
)
|
||||||
""")
|
""")
|
||||||
@@ -162,6 +163,10 @@ class VectorStore:
|
|||||||
CREATE INDEX IF NOT EXISTS idx_chunks_file
|
CREATE INDEX IF NOT EXISTS idx_chunks_file
|
||||||
ON semantic_chunks(file_path)
|
ON semantic_chunks(file_path)
|
||||||
""")
|
""")
|
||||||
|
conn.execute("""
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_chunks_category
|
||||||
|
ON semantic_chunks(category)
|
||||||
|
""")
|
||||||
# Model configuration table - tracks which model generated the embeddings
|
# Model configuration table - tracks which model generated the embeddings
|
||||||
conn.execute("""
|
conn.execute("""
|
||||||
CREATE TABLE IF NOT EXISTS embeddings_config (
|
CREATE TABLE IF NOT EXISTS embeddings_config (
|
||||||
@@ -177,6 +182,8 @@ class VectorStore:
|
|||||||
|
|
||||||
# Migration: Add backend column to existing tables
|
# Migration: Add backend column to existing tables
|
||||||
self._migrate_backend_column(conn)
|
self._migrate_backend_column(conn)
|
||||||
|
# Migration: Add category column
|
||||||
|
self._migrate_category_column(conn)
|
||||||
|
|
||||||
conn.commit()
|
conn.commit()
|
||||||
|
|
||||||
@@ -197,6 +204,28 @@ class VectorStore:
|
|||||||
ADD COLUMN backend TEXT NOT NULL DEFAULT 'fastembed'
|
ADD COLUMN backend TEXT NOT NULL DEFAULT 'fastembed'
|
||||||
""")
|
""")
|
||||||
|
|
||||||
|
def _migrate_category_column(self, conn: sqlite3.Connection) -> None:
|
||||||
|
"""Add category column to existing semantic_chunks table if not present.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
conn: Active SQLite connection
|
||||||
|
"""
|
||||||
|
# Check if category column exists
|
||||||
|
cursor = conn.execute("PRAGMA table_info(semantic_chunks)")
|
||||||
|
columns = [row[1] for row in cursor.fetchall()]
|
||||||
|
|
||||||
|
if 'category' not in columns:
|
||||||
|
logger.info("Migrating semantic_chunks table: adding category column")
|
||||||
|
conn.execute("""
|
||||||
|
ALTER TABLE semantic_chunks
|
||||||
|
ADD COLUMN category TEXT DEFAULT 'code'
|
||||||
|
""")
|
||||||
|
# Create index for fast category filtering
|
||||||
|
conn.execute("""
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_chunks_category
|
||||||
|
ON semantic_chunks(category)
|
||||||
|
""")
|
||||||
|
|
||||||
def _init_ann_index(self) -> None:
|
def _init_ann_index(self) -> None:
|
||||||
"""Initialize ANN index (lazy loading from existing data)."""
|
"""Initialize ANN index (lazy loading from existing data)."""
|
||||||
if not HNSWLIB_AVAILABLE:
|
if not HNSWLIB_AVAILABLE:
|
||||||
@@ -390,9 +419,16 @@ class VectorStore:
|
|||||||
self._ann_index = None
|
self._ann_index = None
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def add_chunk(self, chunk: SemanticChunk, file_path: str) -> int:
|
def add_chunk(
|
||||||
|
self, chunk: SemanticChunk, file_path: str, category: str = "code"
|
||||||
|
) -> int:
|
||||||
"""Add a single chunk with its embedding.
|
"""Add a single chunk with its embedding.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
chunk: SemanticChunk with embedding
|
||||||
|
file_path: Path to the source file
|
||||||
|
category: File category ('code' or 'doc'), default 'code'
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
The inserted chunk ID.
|
The inserted chunk ID.
|
||||||
"""
|
"""
|
||||||
@@ -406,10 +442,10 @@ class VectorStore:
|
|||||||
with sqlite3.connect(self.db_path) as conn:
|
with sqlite3.connect(self.db_path) as conn:
|
||||||
cursor = conn.execute(
|
cursor = conn.execute(
|
||||||
"""
|
"""
|
||||||
INSERT INTO semantic_chunks (file_path, content, embedding, metadata)
|
INSERT INTO semantic_chunks (file_path, content, embedding, metadata, category)
|
||||||
VALUES (?, ?, ?, ?)
|
VALUES (?, ?, ?, ?, ?)
|
||||||
""",
|
""",
|
||||||
(file_path, chunk.content, embedding_blob, metadata_json)
|
(file_path, chunk.content, embedding_blob, metadata_json, category)
|
||||||
)
|
)
|
||||||
conn.commit()
|
conn.commit()
|
||||||
chunk_id = cursor.lastrowid or 0
|
chunk_id = cursor.lastrowid or 0
|
||||||
@@ -427,9 +463,16 @@ class VectorStore:
|
|||||||
self._invalidate_cache()
|
self._invalidate_cache()
|
||||||
return chunk_id
|
return chunk_id
|
||||||
|
|
||||||
def add_chunks(self, chunks: List[SemanticChunk], file_path: str) -> List[int]:
|
def add_chunks(
|
||||||
|
self, chunks: List[SemanticChunk], file_path: str, category: str = "code"
|
||||||
|
) -> List[int]:
|
||||||
"""Add multiple chunks with embeddings (batch insert).
|
"""Add multiple chunks with embeddings (batch insert).
|
||||||
|
|
||||||
|
Args:
|
||||||
|
chunks: List of SemanticChunk objects with embeddings
|
||||||
|
file_path: Path to the source file
|
||||||
|
category: File category ('code' or 'doc'), default 'code'
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List of inserted chunk IDs.
|
List of inserted chunk IDs.
|
||||||
"""
|
"""
|
||||||
@@ -445,7 +488,7 @@ class VectorStore:
|
|||||||
embedding_arr = np.array(chunk.embedding, dtype=np.float32)
|
embedding_arr = np.array(chunk.embedding, dtype=np.float32)
|
||||||
embedding_blob = embedding_arr.tobytes()
|
embedding_blob = embedding_arr.tobytes()
|
||||||
metadata_json = json.dumps(chunk.metadata) if chunk.metadata else None
|
metadata_json = json.dumps(chunk.metadata) if chunk.metadata else None
|
||||||
batch_data.append((file_path, chunk.content, embedding_blob, metadata_json))
|
batch_data.append((file_path, chunk.content, embedding_blob, metadata_json, category))
|
||||||
embeddings_list.append(embedding_arr)
|
embeddings_list.append(embedding_arr)
|
||||||
|
|
||||||
# Batch insert to SQLite
|
# Batch insert to SQLite
|
||||||
@@ -456,8 +499,8 @@ class VectorStore:
|
|||||||
|
|
||||||
conn.executemany(
|
conn.executemany(
|
||||||
"""
|
"""
|
||||||
INSERT INTO semantic_chunks (file_path, content, embedding, metadata)
|
INSERT INTO semantic_chunks (file_path, content, embedding, metadata, category)
|
||||||
VALUES (?, ?, ?, ?)
|
VALUES (?, ?, ?, ?, ?)
|
||||||
""",
|
""",
|
||||||
batch_data
|
batch_data
|
||||||
)
|
)
|
||||||
@@ -484,6 +527,7 @@ class VectorStore:
|
|||||||
chunks_with_paths: List[Tuple[SemanticChunk, str]],
|
chunks_with_paths: List[Tuple[SemanticChunk, str]],
|
||||||
update_ann: bool = True,
|
update_ann: bool = True,
|
||||||
auto_save_ann: bool = True,
|
auto_save_ann: bool = True,
|
||||||
|
categories: Optional[List[str]] = None,
|
||||||
) -> List[int]:
|
) -> List[int]:
|
||||||
"""Batch insert chunks from multiple files in a single transaction.
|
"""Batch insert chunks from multiple files in a single transaction.
|
||||||
|
|
||||||
@@ -494,6 +538,8 @@ class VectorStore:
|
|||||||
update_ann: If True, update ANN index with new vectors (default: True)
|
update_ann: If True, update ANN index with new vectors (default: True)
|
||||||
auto_save_ann: If True, save ANN index after update (default: True).
|
auto_save_ann: If True, save ANN index after update (default: True).
|
||||||
Set to False for bulk inserts to reduce I/O overhead.
|
Set to False for bulk inserts to reduce I/O overhead.
|
||||||
|
categories: Optional list of categories per chunk. If None, defaults to 'code'.
|
||||||
|
If provided, must match length of chunks_with_paths.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List of inserted chunk IDs
|
List of inserted chunk IDs
|
||||||
@@ -503,10 +549,17 @@ class VectorStore:
|
|||||||
|
|
||||||
batch_size = len(chunks_with_paths)
|
batch_size = len(chunks_with_paths)
|
||||||
|
|
||||||
|
# Validate categories if provided
|
||||||
|
if categories is not None and len(categories) != batch_size:
|
||||||
|
raise ValueError(
|
||||||
|
f"categories length ({len(categories)}) must match "
|
||||||
|
f"chunks_with_paths length ({batch_size})"
|
||||||
|
)
|
||||||
|
|
||||||
# Prepare batch data
|
# Prepare batch data
|
||||||
batch_data = []
|
batch_data = []
|
||||||
embeddings_list = []
|
embeddings_list = []
|
||||||
for chunk, file_path in chunks_with_paths:
|
for i, (chunk, file_path) in enumerate(chunks_with_paths):
|
||||||
if chunk.embedding is None:
|
if chunk.embedding is None:
|
||||||
raise ValueError("All chunks must have embeddings")
|
raise ValueError("All chunks must have embeddings")
|
||||||
# Optimize: avoid repeated np.array() if already numpy
|
# Optimize: avoid repeated np.array() if already numpy
|
||||||
@@ -516,7 +569,8 @@ class VectorStore:
|
|||||||
embedding_arr = np.array(chunk.embedding, dtype=np.float32)
|
embedding_arr = np.array(chunk.embedding, dtype=np.float32)
|
||||||
embedding_blob = embedding_arr.tobytes()
|
embedding_blob = embedding_arr.tobytes()
|
||||||
metadata_json = json.dumps(chunk.metadata) if chunk.metadata else None
|
metadata_json = json.dumps(chunk.metadata) if chunk.metadata else None
|
||||||
batch_data.append((file_path, chunk.content, embedding_blob, metadata_json))
|
category = categories[i] if categories else "code"
|
||||||
|
batch_data.append((file_path, chunk.content, embedding_blob, metadata_json, category))
|
||||||
embeddings_list.append(embedding_arr)
|
embeddings_list.append(embedding_arr)
|
||||||
|
|
||||||
# Batch insert to SQLite in single transaction
|
# Batch insert to SQLite in single transaction
|
||||||
@@ -529,8 +583,8 @@ class VectorStore:
|
|||||||
|
|
||||||
conn.executemany(
|
conn.executemany(
|
||||||
"""
|
"""
|
||||||
INSERT INTO semantic_chunks (file_path, content, embedding, metadata)
|
INSERT INTO semantic_chunks (file_path, content, embedding, metadata, category)
|
||||||
VALUES (?, ?, ?, ?)
|
VALUES (?, ?, ?, ?, ?)
|
||||||
""",
|
""",
|
||||||
batch_data
|
batch_data
|
||||||
)
|
)
|
||||||
@@ -565,6 +619,7 @@ class VectorStore:
|
|||||||
embeddings_matrix: np.ndarray,
|
embeddings_matrix: np.ndarray,
|
||||||
update_ann: bool = True,
|
update_ann: bool = True,
|
||||||
auto_save_ann: bool = True,
|
auto_save_ann: bool = True,
|
||||||
|
categories: Optional[List[str]] = None,
|
||||||
) -> List[int]:
|
) -> List[int]:
|
||||||
"""Batch insert chunks with pre-computed numpy embeddings matrix.
|
"""Batch insert chunks with pre-computed numpy embeddings matrix.
|
||||||
|
|
||||||
@@ -576,6 +631,7 @@ class VectorStore:
|
|||||||
embeddings_matrix: Pre-computed embeddings as (N, D) numpy array
|
embeddings_matrix: Pre-computed embeddings as (N, D) numpy array
|
||||||
update_ann: If True, update ANN index with new vectors (default: True)
|
update_ann: If True, update ANN index with new vectors (default: True)
|
||||||
auto_save_ann: If True, save ANN index after update (default: True)
|
auto_save_ann: If True, save ANN index after update (default: True)
|
||||||
|
categories: Optional list of categories per chunk. If None, defaults to 'code'.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List of inserted chunk IDs
|
List of inserted chunk IDs
|
||||||
@@ -591,6 +647,13 @@ class VectorStore:
|
|||||||
f"{embeddings_matrix.shape[0]} embeddings"
|
f"{embeddings_matrix.shape[0]} embeddings"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Validate categories if provided
|
||||||
|
if categories is not None and len(categories) != batch_size:
|
||||||
|
raise ValueError(
|
||||||
|
f"categories length ({len(categories)}) must match "
|
||||||
|
f"chunks_with_paths length ({batch_size})"
|
||||||
|
)
|
||||||
|
|
||||||
# Ensure float32 format
|
# Ensure float32 format
|
||||||
embeddings_matrix = embeddings_matrix.astype(np.float32)
|
embeddings_matrix = embeddings_matrix.astype(np.float32)
|
||||||
|
|
||||||
@@ -600,7 +663,8 @@ class VectorStore:
|
|||||||
embedding_arr = embeddings_matrix[i]
|
embedding_arr = embeddings_matrix[i]
|
||||||
embedding_blob = embedding_arr.tobytes()
|
embedding_blob = embedding_arr.tobytes()
|
||||||
metadata_json = json.dumps(chunk.metadata) if chunk.metadata else None
|
metadata_json = json.dumps(chunk.metadata) if chunk.metadata else None
|
||||||
batch_data.append((file_path, chunk.content, embedding_blob, metadata_json))
|
category = categories[i] if categories else "code"
|
||||||
|
batch_data.append((file_path, chunk.content, embedding_blob, metadata_json, category))
|
||||||
|
|
||||||
# Batch insert to SQLite in single transaction
|
# Batch insert to SQLite in single transaction
|
||||||
with sqlite3.connect(self.db_path) as conn:
|
with sqlite3.connect(self.db_path) as conn:
|
||||||
@@ -612,8 +676,8 @@ class VectorStore:
|
|||||||
|
|
||||||
conn.executemany(
|
conn.executemany(
|
||||||
"""
|
"""
|
||||||
INSERT INTO semantic_chunks (file_path, content, embedding, metadata)
|
INSERT INTO semantic_chunks (file_path, content, embedding, metadata, category)
|
||||||
VALUES (?, ?, ?, ?)
|
VALUES (?, ?, ?, ?, ?)
|
||||||
""",
|
""",
|
||||||
batch_data
|
batch_data
|
||||||
)
|
)
|
||||||
@@ -765,6 +829,7 @@ class VectorStore:
|
|||||||
top_k: int = 10,
|
top_k: int = 10,
|
||||||
min_score: float = 0.0,
|
min_score: float = 0.0,
|
||||||
return_full_content: bool = True,
|
return_full_content: bool = True,
|
||||||
|
category: Optional[str] = None,
|
||||||
) -> List[SearchResult]:
|
) -> List[SearchResult]:
|
||||||
"""Find chunks most similar to query embedding.
|
"""Find chunks most similar to query embedding.
|
||||||
|
|
||||||
@@ -776,6 +841,7 @@ class VectorStore:
|
|||||||
top_k: Maximum results to return.
|
top_k: Maximum results to return.
|
||||||
min_score: Minimum cosine similarity score in [0.0, 1.0].
|
min_score: Minimum cosine similarity score in [0.0, 1.0].
|
||||||
return_full_content: If True, return full code block content.
|
return_full_content: If True, return full code block content.
|
||||||
|
category: Optional category filter ('code' or 'doc'). If None, returns all.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List of SearchResult ordered by similarity (highest first).
|
List of SearchResult ordered by similarity (highest first).
|
||||||
@@ -796,14 +862,14 @@ class VectorStore:
|
|||||||
):
|
):
|
||||||
try:
|
try:
|
||||||
return self._search_with_ann(
|
return self._search_with_ann(
|
||||||
query_vec, top_k, min_score, return_full_content
|
query_vec, top_k, min_score, return_full_content, category
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning("ANN search failed, falling back to brute-force: %s", e)
|
logger.warning("ANN search failed, falling back to brute-force: %s", e)
|
||||||
|
|
||||||
# Fallback to brute-force search (O(N))
|
# Fallback to brute-force search (O(N))
|
||||||
return self._search_brute_force(
|
return self._search_brute_force(
|
||||||
query_vec, top_k, min_score, return_full_content
|
query_vec, top_k, min_score, return_full_content, category
|
||||||
)
|
)
|
||||||
|
|
||||||
def _search_with_ann(
|
def _search_with_ann(
|
||||||
@@ -812,6 +878,7 @@ class VectorStore:
|
|||||||
top_k: int,
|
top_k: int,
|
||||||
min_score: float,
|
min_score: float,
|
||||||
return_full_content: bool,
|
return_full_content: bool,
|
||||||
|
category: Optional[str] = None,
|
||||||
) -> List[SearchResult]:
|
) -> List[SearchResult]:
|
||||||
"""Search using HNSW index (O(log N)).
|
"""Search using HNSW index (O(log N)).
|
||||||
|
|
||||||
@@ -820,13 +887,16 @@ class VectorStore:
|
|||||||
top_k: Maximum results to return
|
top_k: Maximum results to return
|
||||||
min_score: Minimum cosine similarity score in [0.0, 1.0]
|
min_score: Minimum cosine similarity score in [0.0, 1.0]
|
||||||
return_full_content: If True, return full code block content
|
return_full_content: If True, return full code block content
|
||||||
|
category: Optional category filter ('code' or 'doc')
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List of SearchResult ordered by similarity (highest first)
|
List of SearchResult ordered by similarity (highest first)
|
||||||
"""
|
"""
|
||||||
# Limit top_k to available vectors to prevent hnswlib error
|
# Limit top_k to available vectors to prevent hnswlib error
|
||||||
ann_count = self._ann_index.count()
|
ann_count = self._ann_index.count()
|
||||||
effective_top_k = min(top_k, ann_count) if ann_count > 0 else 0
|
# When category filtering, fetch more candidates to compensate for filtering
|
||||||
|
fetch_k = top_k * 3 if category else top_k
|
||||||
|
effective_top_k = min(fetch_k, ann_count) if ann_count > 0 else 0
|
||||||
|
|
||||||
if effective_top_k == 0:
|
if effective_top_k == 0:
|
||||||
return []
|
return []
|
||||||
@@ -875,8 +945,12 @@ class VectorStore:
|
|||||||
top_ids = [f[0] for f in filtered]
|
top_ids = [f[0] for f in filtered]
|
||||||
top_scores = [f[1] for f in filtered]
|
top_scores = [f[1] for f in filtered]
|
||||||
|
|
||||||
# Fetch content from SQLite
|
# Fetch content from SQLite with category filtering
|
||||||
return self._fetch_results_by_ids(top_ids, top_scores, return_full_content)
|
results = self._fetch_results_by_ids(
|
||||||
|
top_ids, top_scores, return_full_content, category
|
||||||
|
)
|
||||||
|
# Apply final limit after category filtering
|
||||||
|
return results[:top_k]
|
||||||
|
|
||||||
def _search_brute_force(
|
def _search_brute_force(
|
||||||
self,
|
self,
|
||||||
@@ -884,6 +958,7 @@ class VectorStore:
|
|||||||
top_k: int,
|
top_k: int,
|
||||||
min_score: float,
|
min_score: float,
|
||||||
return_full_content: bool,
|
return_full_content: bool,
|
||||||
|
category: Optional[str] = None,
|
||||||
) -> List[SearchResult]:
|
) -> List[SearchResult]:
|
||||||
"""Brute-force search using NumPy (O(N) fallback).
|
"""Brute-force search using NumPy (O(N) fallback).
|
||||||
|
|
||||||
@@ -892,6 +967,7 @@ class VectorStore:
|
|||||||
top_k: Maximum results to return
|
top_k: Maximum results to return
|
||||||
min_score: Minimum cosine similarity score in [0.0, 1.0]
|
min_score: Minimum cosine similarity score in [0.0, 1.0]
|
||||||
return_full_content: If True, return full code block content
|
return_full_content: If True, return full code block content
|
||||||
|
category: Optional category filter ('code' or 'doc')
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List of SearchResult ordered by similarity (highest first)
|
List of SearchResult ordered by similarity (highest first)
|
||||||
@@ -926,27 +1002,31 @@ class VectorStore:
|
|||||||
if len(valid_indices) == 0:
|
if len(valid_indices) == 0:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
# Sort by score descending and take top_k
|
# When category filtering, fetch more candidates to compensate for filtering
|
||||||
|
fetch_k = top_k * 3 if category else top_k
|
||||||
|
|
||||||
|
# Sort by score descending and take top candidates
|
||||||
valid_scores = scores[valid_indices]
|
valid_scores = scores[valid_indices]
|
||||||
sorted_order = np.argsort(valid_scores)[::-1][:top_k]
|
sorted_order = np.argsort(valid_scores)[::-1][:fetch_k]
|
||||||
top_indices = valid_indices[sorted_order]
|
top_indices = valid_indices[sorted_order]
|
||||||
top_scores = valid_scores[sorted_order]
|
top_scores = valid_scores[sorted_order]
|
||||||
|
|
||||||
# Get chunk IDs for top results
|
# Get chunk IDs for top results
|
||||||
top_ids = [self._chunk_ids[i] for i in top_indices]
|
top_ids = [self._chunk_ids[i] for i in top_indices]
|
||||||
|
|
||||||
# Fetch content only for top-k results (lazy loading)
|
# Fetch content only for top-k results (lazy loading) with category filtering
|
||||||
results = self._fetch_results_by_ids(
|
results = self._fetch_results_by_ids(
|
||||||
top_ids, top_scores.tolist(), return_full_content
|
top_ids, top_scores.tolist(), return_full_content, category
|
||||||
)
|
)
|
||||||
|
# Apply final limit after category filtering
|
||||||
return results
|
return results[:top_k]
|
||||||
|
|
||||||
def _fetch_results_by_ids(
|
def _fetch_results_by_ids(
|
||||||
self,
|
self,
|
||||||
chunk_ids: List[int],
|
chunk_ids: List[int],
|
||||||
scores: List[float],
|
scores: List[float],
|
||||||
return_full_content: bool,
|
return_full_content: bool,
|
||||||
|
category: Optional[str] = None,
|
||||||
) -> List[SearchResult]:
|
) -> List[SearchResult]:
|
||||||
"""Fetch full result data for specific chunk IDs.
|
"""Fetch full result data for specific chunk IDs.
|
||||||
|
|
||||||
@@ -954,6 +1034,7 @@ class VectorStore:
|
|||||||
chunk_ids: List of chunk IDs to fetch.
|
chunk_ids: List of chunk IDs to fetch.
|
||||||
scores: Corresponding similarity scores.
|
scores: Corresponding similarity scores.
|
||||||
return_full_content: Whether to include full content.
|
return_full_content: Whether to include full content.
|
||||||
|
category: Optional category filter ('code' or 'doc').
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List of SearchResult objects.
|
List of SearchResult objects.
|
||||||
@@ -968,15 +1049,25 @@ class VectorStore:
|
|||||||
# SQL injection prevention:
|
# SQL injection prevention:
|
||||||
# - Only a validated placeholders string (commas + '?') is interpolated into the query.
|
# - Only a validated placeholders string (commas + '?') is interpolated into the query.
|
||||||
# - User-provided values are passed separately via sqlite3 parameters.
|
# - User-provided values are passed separately via sqlite3 parameters.
|
||||||
query = """
|
# - Category filter is added as a separate parameter
|
||||||
SELECT id, file_path, content, metadata
|
if category:
|
||||||
FROM semantic_chunks
|
query = """
|
||||||
WHERE id IN ({placeholders})
|
SELECT id, file_path, content, metadata
|
||||||
""".format(placeholders=placeholders)
|
FROM semantic_chunks
|
||||||
|
WHERE id IN ({placeholders}) AND category = ?
|
||||||
|
""".format(placeholders=placeholders)
|
||||||
|
params = list(chunk_ids) + [category]
|
||||||
|
else:
|
||||||
|
query = """
|
||||||
|
SELECT id, file_path, content, metadata
|
||||||
|
FROM semantic_chunks
|
||||||
|
WHERE id IN ({placeholders})
|
||||||
|
""".format(placeholders=placeholders)
|
||||||
|
params = chunk_ids
|
||||||
|
|
||||||
with sqlite3.connect(self.db_path) as conn:
|
with sqlite3.connect(self.db_path) as conn:
|
||||||
conn.execute("PRAGMA mmap_size = 30000000000")
|
conn.execute("PRAGMA mmap_size = 30000000000")
|
||||||
rows = conn.execute(query, chunk_ids).fetchall()
|
rows = conn.execute(query, params).fetchall()
|
||||||
|
|
||||||
# Build ID -> row mapping
|
# Build ID -> row mapping
|
||||||
id_to_row = {r[0]: r for r in rows}
|
id_to_row = {r[0]: r for r in rows}
|
||||||
|
|||||||
@@ -30,8 +30,22 @@ class WatcherConfig:
|
|||||||
"""Configuration for file watcher."""
|
"""Configuration for file watcher."""
|
||||||
debounce_ms: int = 1000
|
debounce_ms: int = 1000
|
||||||
ignored_patterns: Set[str] = field(default_factory=lambda: {
|
ignored_patterns: Set[str] = field(default_factory=lambda: {
|
||||||
".git", ".venv", "venv", "node_modules",
|
# Version control
|
||||||
"__pycache__", ".codexlens", ".idea", ".vscode",
|
".git", ".svn", ".hg",
|
||||||
|
# Python environments & cache
|
||||||
|
".venv", "venv", "env", "__pycache__", ".pytest_cache", ".mypy_cache", ".ruff_cache",
|
||||||
|
# Node.js
|
||||||
|
"node_modules", "bower_components", ".npm", ".yarn",
|
||||||
|
# Build artifacts
|
||||||
|
"dist", "build", "out", "target", "bin", "obj", "_build", "coverage", "htmlcov",
|
||||||
|
# IDE & Editor
|
||||||
|
".idea", ".vscode", ".vs", ".eclipse",
|
||||||
|
# CodexLens internal
|
||||||
|
".codexlens",
|
||||||
|
# Package manager caches
|
||||||
|
".cache", ".parcel-cache", ".turbo", ".next", ".nuxt",
|
||||||
|
# Logs & temp
|
||||||
|
"logs", "tmp", "temp",
|
||||||
})
|
})
|
||||||
languages: Optional[List[str]] = None # None = all supported
|
languages: Optional[List[str]] = None # None = all supported
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user