mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-02-05 01:50:27 +08:00
Main changes:
1. Remove graph indexing - delete graph_analyzer.py and related migration files - remove the CLI graph command and the --enrich flag - clean up the graph query methods in chain_search.py (370 lines) - delete the related test files
2. Fix embedding-generation memory usage - refactor generate_embeddings.py to use streaming batch processing - switch to embedding_manager's memory-safe implementation - file trimmed from 548 to 259 lines (52.7% reduction)
3. Fix memory leaks - chain_search.py: quick_search manages ChainSearchEngine with a with statement - embedding_manager.py: manage VectorStore with a with statement - vector_store.py: add a memory warning for brute-force search
4. Code cleanup - remove the token_count and symbol_type fields from the Symbol model - clean up the related test cases
Tests: 760 passed, 7 skipped
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
778 lines
26 KiB
Python
"""Vector storage and similarity search for semantic chunks.
|
|
|
|
Optimized for high-performance similarity search using:
|
|
- HNSW index for O(log N) approximate nearest neighbor search (primary)
|
|
- Cached embedding matrix for batch operations (fallback)
|
|
- NumPy vectorized cosine similarity (fallback, 100x+ faster than loops)
|
|
- Lazy content loading (only fetch for top-k results)
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
import sqlite3
|
|
import threading
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional, Tuple
|
|
|
|
from codexlens.entities import SearchResult, SemanticChunk
|
|
from codexlens.errors import StorageError
|
|
|
|
from . import SEMANTIC_AVAILABLE
|
|
|
|
if SEMANTIC_AVAILABLE:
|
|
import numpy as np
|
|
|
|
# Try to import ANN index (optional hnswlib dependency)
|
|
try:
|
|
from codexlens.semantic.ann_index import ANNIndex, HNSWLIB_AVAILABLE
|
|
except ImportError:
|
|
HNSWLIB_AVAILABLE = False
|
|
ANNIndex = None
|
|
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def _cosine_similarity(a: List[float], b: List[float]) -> float:
|
|
"""Compute cosine similarity between two vectors."""
|
|
if not SEMANTIC_AVAILABLE:
|
|
raise ImportError("numpy required for vector operations")
|
|
|
|
a_arr = np.array(a)
|
|
b_arr = np.array(b)
|
|
|
|
norm_a = np.linalg.norm(a_arr)
|
|
norm_b = np.linalg.norm(b_arr)
|
|
|
|
if norm_a == 0 or norm_b == 0:
|
|
return 0.0
|
|
|
|
return float(np.dot(a_arr, b_arr) / (norm_a * norm_b))
|
|
|
|
|
|
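
# Illustrative values (added for clarity, not part of the original module):
# cosine similarity ranges from -1 to 1 and ignores vector magnitude, e.g.
#   _cosine_similarity([1.0, 0.0], [1.0, 0.0])  -> 1.0   (same direction)
#   _cosine_similarity([1.0, 0.0], [0.0, 1.0])  -> 0.0   (orthogonal)
#   _cosine_similarity([1.0, 0.0], [2.0, 0.0])  -> 1.0   (scale-invariant)
#   _cosine_similarity([1.0, 0.0], [0.0, 0.0])  -> 0.0   (zero-vector guard above)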


class VectorStore:
    """SQLite-based vector storage with HNSW-accelerated similarity search.

    Performance optimizations:
    - HNSW index for O(log N) approximate nearest neighbor search
    - Embedding matrix cached in memory for batch similarity computation (fallback)
    - NumPy vectorized operations instead of Python loops (fallback)
    - Lazy content loading - only fetch full content for top-k results
    - Thread-safe cache invalidation
    """

    # Default embedding dimension (used when creating new index)
    DEFAULT_DIM = 768

    def __init__(self, db_path: str | Path) -> None:
        if not SEMANTIC_AVAILABLE:
            raise ImportError(
                "Semantic search dependencies not available. "
                "Install with: pip install codexlens[semantic]"
            )

        self.db_path = Path(db_path)
        self.db_path.parent.mkdir(parents=True, exist_ok=True)

        # Embedding cache for fast similarity search (fallback)
        self._cache_lock = threading.RLock()
        self._embedding_matrix: Optional[np.ndarray] = None
        self._embedding_norms: Optional[np.ndarray] = None
        self._chunk_ids: Optional[List[int]] = None
        self._cache_version: int = 0

        # ANN index for O(log N) search
        self._ann_index: Optional[ANNIndex] = None
        self._ann_dim: Optional[int] = None
        self._ann_write_lock = threading.Lock()  # Protects ANN index modifications

        self._init_schema()
        self._init_ann_index()

    def _init_schema(self) -> None:
        """Initialize vector storage schema."""
        with sqlite3.connect(self.db_path) as conn:
            # Enable memory mapping for faster reads
            conn.execute("PRAGMA mmap_size = 30000000000")  # 30GB limit
            conn.execute("""
                CREATE TABLE IF NOT EXISTS semantic_chunks (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    file_path TEXT NOT NULL,
                    content TEXT NOT NULL,
                    embedding BLOB NOT NULL,
                    metadata TEXT,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            """)
            conn.execute("""
                CREATE INDEX IF NOT EXISTS idx_chunks_file
                ON semantic_chunks(file_path)
            """)
            conn.commit()
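
    # A minimal sketch (added for clarity, not in the original file) of how
    # embeddings round-trip through the BLOB column created above: vectors are
    # serialized with numpy float32 tobytes() and restored with np.frombuffer():
    #   vec = np.array([0.1, 0.2, 0.3], dtype=np.float32)
    #   blob = vec.tobytes()                               # 3 * 4 = 12 bytes
    #   restored = np.frombuffer(blob, dtype=np.float32)   # array([0.1, 0.2, 0.3])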

    def _init_ann_index(self) -> None:
        """Initialize ANN index (lazy loading from existing data)."""
        if not HNSWLIB_AVAILABLE:
            logger.debug("hnswlib not available, using brute-force search")
            return

        # Try to detect embedding dimension from existing data
        dim = self._detect_embedding_dim()
        if dim is None:
            # No data yet, will initialize on first add
            logger.debug("No embeddings found, ANN index will be created on first add")
            return

        self._ann_dim = dim

        try:
            self._ann_index = ANNIndex(self.db_path, dim)
            if self._ann_index.load():
                logger.debug(
                    "Loaded ANN index with %d vectors", self._ann_index.count()
                )
            else:
                # Index file doesn't exist, try to build from SQLite data
                logger.debug("ANN index file not found, rebuilding from SQLite")
                self._rebuild_ann_index_internal()
        except Exception as e:
            logger.warning("Failed to initialize ANN index: %s", e)
            self._ann_index = None

    def _detect_embedding_dim(self) -> Optional[int]:
        """Detect embedding dimension from existing data."""
        with sqlite3.connect(self.db_path) as conn:
            row = conn.execute(
                "SELECT embedding FROM semantic_chunks LIMIT 1"
            ).fetchone()
            if row and row[0]:
                # Embedding is stored as float32 blob
                blob = row[0]
                return len(blob) // np.dtype(np.float32).itemsize
            return None
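
    # Worked example (added for clarity, not in the original source): a
    # 768-dimensional float32 embedding is stored as a 768 * 4 = 3072-byte BLOB,
    # so len(blob) // np.dtype(np.float32).itemsize == 3072 // 4 == 768.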

    @property
    def dimension(self) -> Optional[int]:
        """Return the dimension of embeddings in the store.

        Returns:
            Embedding dimension if available, None if store is empty.
        """
        if self._ann_dim is not None:
            return self._ann_dim
        self._ann_dim = self._detect_embedding_dim()
        return self._ann_dim

    def _rebuild_ann_index_internal(self) -> int:
        """Internal method to rebuild ANN index from SQLite data."""
        if self._ann_index is None:
            return 0

        with sqlite3.connect(self.db_path) as conn:
            conn.execute("PRAGMA mmap_size = 30000000000")
            rows = conn.execute(
                "SELECT id, embedding FROM semantic_chunks"
            ).fetchall()

        if not rows:
            return 0

        # Extract IDs and embeddings
        ids = [r[0] for r in rows]
        embeddings = np.vstack([
            np.frombuffer(r[1], dtype=np.float32) for r in rows
        ])

        # Add to ANN index
        self._ann_index.add_vectors(ids, embeddings)
        self._ann_index.save()

        logger.info("Rebuilt ANN index with %d vectors", len(ids))
        return len(ids)

    def rebuild_ann_index(self) -> int:
        """Rebuild HNSW index from all chunks in SQLite.

        Use this method to:
        - Migrate existing data to use ANN search
        - Repair corrupted index
        - Reclaim space after many deletions

        Returns:
            Number of vectors indexed.
        """
        if not HNSWLIB_AVAILABLE:
            logger.warning("hnswlib not available, cannot rebuild ANN index")
            return 0

        # Detect dimension
        dim = self._detect_embedding_dim()
        if dim is None:
            logger.warning("No embeddings found, cannot rebuild ANN index")
            return 0

        self._ann_dim = dim

        # Create new index
        try:
            self._ann_index = ANNIndex(self.db_path, dim)
            return self._rebuild_ann_index_internal()
        except Exception as e:
            logger.error("Failed to rebuild ANN index: %s", e)
            self._ann_index = None
            return 0
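
    # Usage sketch (illustrative only, not part of the original module), e.g.
    # after bulk deletions or when adopting hnswlib on an existing database;
    # the path below is hypothetical:
    #   store = VectorStore("index/vectors.db")
    #   indexed = store.rebuild_ann_index()
    #   logger.info("re-indexed %d vectors", indexed)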

    def _invalidate_cache(self) -> None:
        """Invalidate the embedding cache (thread-safe)."""
        with self._cache_lock:
            self._embedding_matrix = None
            self._embedding_norms = None
            self._chunk_ids = None
            self._cache_version += 1

    def _refresh_cache(self) -> bool:
        """Load embeddings into numpy matrix for fast similarity search.

        Returns:
            True if cache was refreshed successfully, False if no data.
        """
        with self._cache_lock:
            with sqlite3.connect(self.db_path) as conn:
                conn.execute("PRAGMA mmap_size = 30000000000")
                rows = conn.execute(
                    "SELECT id, embedding FROM semantic_chunks"
                ).fetchall()

            if not rows:
                self._embedding_matrix = None
                self._embedding_norms = None
                self._chunk_ids = None
                return False

            # Extract IDs and embeddings
            self._chunk_ids = [r[0] for r in rows]

            # Bulk convert binary blobs to numpy matrix
            embeddings = [
                np.frombuffer(r[1], dtype=np.float32) for r in rows
            ]
            self._embedding_matrix = np.vstack(embeddings)

            # Pre-compute norms for faster similarity calculation
            self._embedding_norms = np.linalg.norm(
                self._embedding_matrix, axis=1, keepdims=True
            )
            # Avoid division by zero
            self._embedding_norms = np.where(
                self._embedding_norms == 0, 1e-10, self._embedding_norms
            )

            return True
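
    # Shape note (added for clarity, not in the original file): for N cached
    # chunks of dimension D, _embedding_matrix is (N, D) and _embedding_norms is
    # (N, 1); the 1e-10 floor keeps the later division well-defined even if an
    # all-zero embedding row is ever stored.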

    def _ensure_ann_index(self, dim: int) -> bool:
        """Ensure ANN index is initialized with correct dimension.

        This method is thread-safe and uses double-checked locking.

        Args:
            dim: Embedding dimension

        Returns:
            True if ANN index is ready, False otherwise
        """
        if not HNSWLIB_AVAILABLE:
            return False

        # Fast path: index already initialized (no lock needed)
        if self._ann_index is not None:
            return True

        # Slow path: acquire lock for initialization
        with self._ann_write_lock:
            # Double-check after acquiring lock
            if self._ann_index is not None:
                return True

            try:
                self._ann_dim = dim
                self._ann_index = ANNIndex(self.db_path, dim)
                self._ann_index.load()  # Try to load existing
                return True
            except Exception as e:
                logger.warning("Failed to initialize ANN index: %s", e)
                self._ann_index = None
                return False

    def add_chunk(self, chunk: SemanticChunk, file_path: str) -> int:
        """Add a single chunk with its embedding.

        Returns:
            The inserted chunk ID.
        """
        if chunk.embedding is None:
            raise ValueError("Chunk must have embedding before adding to store")

        embedding_arr = np.array(chunk.embedding, dtype=np.float32)
        embedding_blob = embedding_arr.tobytes()
        metadata_json = json.dumps(chunk.metadata) if chunk.metadata else None

        with sqlite3.connect(self.db_path) as conn:
            cursor = conn.execute(
                """
                INSERT INTO semantic_chunks (file_path, content, embedding, metadata)
                VALUES (?, ?, ?, ?)
                """,
                (file_path, chunk.content, embedding_blob, metadata_json)
            )
            conn.commit()
            chunk_id = cursor.lastrowid or 0

        # Add to ANN index
        if self._ensure_ann_index(len(chunk.embedding)):
            with self._ann_write_lock:
                try:
                    self._ann_index.add_vectors([chunk_id], embedding_arr.reshape(1, -1))
                    self._ann_index.save()
                except Exception as e:
                    logger.warning("Failed to add to ANN index: %s", e)

        # Invalidate cache after modification
        self._invalidate_cache()
        return chunk_id
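
    # Usage sketch (illustrative only; the exact SemanticChunk constructor is an
    # assumption, not taken from this file). Given an open VectorStore `store`:
    #   chunk = SemanticChunk(
    #       content="def foo(): ...",
    #       embedding=[0.1] * 768,
    #       metadata={"symbol_name": "foo", "symbol_kind": "function"},
    #   )
    #   chunk_id = store.add_chunk(chunk, file_path="src/foo.py")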

    def add_chunks(self, chunks: List[SemanticChunk], file_path: str) -> List[int]:
        """Add multiple chunks with embeddings (batch insert).

        Returns:
            List of inserted chunk IDs.
        """
        if not chunks:
            return []

        # Prepare batch data
        batch_data = []
        embeddings_list = []
        for chunk in chunks:
            if chunk.embedding is None:
                raise ValueError("All chunks must have embeddings")
            embedding_arr = np.array(chunk.embedding, dtype=np.float32)
            embedding_blob = embedding_arr.tobytes()
            metadata_json = json.dumps(chunk.metadata) if chunk.metadata else None
            batch_data.append((file_path, chunk.content, embedding_blob, metadata_json))
            embeddings_list.append(embedding_arr)

        # Batch insert to SQLite
        with sqlite3.connect(self.db_path) as conn:
            # Get starting ID before insert
            row = conn.execute("SELECT MAX(id) FROM semantic_chunks").fetchone()
            start_id = (row[0] or 0) + 1

            conn.executemany(
                """
                INSERT INTO semantic_chunks (file_path, content, embedding, metadata)
                VALUES (?, ?, ?, ?)
                """,
                batch_data
            )
            conn.commit()
            # Calculate inserted IDs based on starting ID
            ids = list(range(start_id, start_id + len(chunks)))

        # Add to ANN index
        if embeddings_list and self._ensure_ann_index(len(embeddings_list[0])):
            with self._ann_write_lock:
                try:
                    embeddings_matrix = np.vstack(embeddings_list)
                    self._ann_index.add_vectors(ids, embeddings_matrix)
                    self._ann_index.save()
                except Exception as e:
                    logger.warning("Failed to add batch to ANN index: %s", e)

        # Invalidate cache after modification
        self._invalidate_cache()
        return ids

    def add_chunks_batch(
        self, chunks_with_paths: List[Tuple[SemanticChunk, str]]
    ) -> List[int]:
        """Batch insert chunks from multiple files in a single transaction.

        This method is optimized for bulk operations during index generation.

        Args:
            chunks_with_paths: List of (chunk, file_path) tuples

        Returns:
            List of inserted chunk IDs
        """
        if not chunks_with_paths:
            return []

        # Prepare batch data
        batch_data = []
        embeddings_list = []
        for chunk, file_path in chunks_with_paths:
            if chunk.embedding is None:
                raise ValueError("All chunks must have embeddings")
            embedding_arr = np.array(chunk.embedding, dtype=np.float32)
            embedding_blob = embedding_arr.tobytes()
            metadata_json = json.dumps(chunk.metadata) if chunk.metadata else None
            batch_data.append((file_path, chunk.content, embedding_blob, metadata_json))
            embeddings_list.append(embedding_arr)

        # Batch insert to SQLite in single transaction
        with sqlite3.connect(self.db_path) as conn:
            # Get starting ID before insert
            row = conn.execute("SELECT MAX(id) FROM semantic_chunks").fetchone()
            start_id = (row[0] or 0) + 1

            conn.executemany(
                """
                INSERT INTO semantic_chunks (file_path, content, embedding, metadata)
                VALUES (?, ?, ?, ?)
                """,
                batch_data
            )
            conn.commit()
            # Calculate inserted IDs based on starting ID
            ids = list(range(start_id, start_id + len(chunks_with_paths)))

        # Add to ANN index
        if embeddings_list and self._ensure_ann_index(len(embeddings_list[0])):
            with self._ann_write_lock:
                try:
                    embeddings_matrix = np.vstack(embeddings_list)
                    self._ann_index.add_vectors(ids, embeddings_matrix)
                    self._ann_index.save()
                except Exception as e:
                    logger.warning("Failed to add batch to ANN index: %s", e)

        # Invalidate cache after modification
        self._invalidate_cache()
        return ids
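
    # Assumption documented here (comment added, not in the original source):
    # both batch insert methods infer the new row IDs as MAX(id)+1 .. MAX(id)+N,
    # which matches the inserted rows only when this store is the sole writer.
    # Note that AUTOINCREMENT never reuses IDs of deleted rows, so after
    # deleting the highest-ID rows the actual assigned IDs can start above
    # MAX(id)+1.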

    def delete_file_chunks(self, file_path: str) -> int:
        """Delete all chunks for a file.

        Returns:
            Number of deleted chunks.
        """
        # Get chunk IDs before deletion (for ANN index)
        chunk_ids_to_delete = []
        if self._ann_index is not None:
            with sqlite3.connect(self.db_path) as conn:
                rows = conn.execute(
                    "SELECT id FROM semantic_chunks WHERE file_path = ?",
                    (file_path,)
                ).fetchall()
                chunk_ids_to_delete = [r[0] for r in rows]

        # Delete from SQLite
        with sqlite3.connect(self.db_path) as conn:
            cursor = conn.execute(
                "DELETE FROM semantic_chunks WHERE file_path = ?",
                (file_path,)
            )
            conn.commit()
            deleted = cursor.rowcount

        # Remove from ANN index
        if deleted > 0 and self._ann_index is not None and chunk_ids_to_delete:
            with self._ann_write_lock:
                try:
                    self._ann_index.remove_vectors(chunk_ids_to_delete)
                    self._ann_index.save()
                except Exception as e:
                    logger.warning("Failed to remove from ANN index: %s", e)

        if deleted > 0:
            self._invalidate_cache()
        return deleted

    def search_similar(
        self,
        query_embedding: List[float],
        top_k: int = 10,
        min_score: float = 0.0,
        return_full_content: bool = True,
    ) -> List[SearchResult]:
        """Find chunks most similar to query embedding.

        Uses HNSW index for O(log N) search when available, falls back to
        brute-force NumPy search otherwise.

        Args:
            query_embedding: Query vector.
            top_k: Maximum results to return.
            min_score: Minimum similarity score (0-1).
            return_full_content: If True, return full code block content.

        Returns:
            List of SearchResult ordered by similarity (highest first).
        """
        query_vec = np.array(query_embedding, dtype=np.float32)

        # Try HNSW search first (O(log N))
        if (
            HNSWLIB_AVAILABLE
            and self._ann_index is not None
            and self._ann_index.is_loaded
            and self._ann_index.count() > 0
        ):
            try:
                return self._search_with_ann(
                    query_vec, top_k, min_score, return_full_content
                )
            except Exception as e:
                logger.warning("ANN search failed, falling back to brute-force: %s", e)

        # Fallback to brute-force search (O(N))
        return self._search_brute_force(
            query_vec, top_k, min_score, return_full_content
        )
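
    # Usage sketch (illustrative; `store` and the `embedder` API are assumptions,
    # not defined in this module). The query vector must have the same dimension
    # as the stored embeddings:
    #   query_vec = embedder.encode("parse config file")   # hypothetical encoder
    #   results = store.search_similar(query_vec, top_k=5, min_score=0.3)
    #   for r in results:
    #       print(r.path, round(r.score, 3), r.excerpt[:60])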

    def _search_with_ann(
        self,
        query_vec: np.ndarray,
        top_k: int,
        min_score: float,
        return_full_content: bool,
    ) -> List[SearchResult]:
        """Search using HNSW index (O(log N)).

        Args:
            query_vec: Query vector as numpy array
            top_k: Maximum results to return
            min_score: Minimum similarity score (0-1)
            return_full_content: If True, return full code block content

        Returns:
            List of SearchResult ordered by similarity (highest first)
        """
        # Limit top_k to available vectors to prevent hnswlib error
        ann_count = self._ann_index.count()
        effective_top_k = min(top_k, ann_count) if ann_count > 0 else 0

        if effective_top_k == 0:
            return []

        # HNSW search returns (ids, distances)
        # For cosine space: distance = 1 - similarity
        ids, distances = self._ann_index.search(query_vec, effective_top_k)

        if not ids:
            return []

        # Convert distances to similarity scores
        scores = [1.0 - d for d in distances]

        # Filter by min_score
        filtered = [
            (chunk_id, score)
            for chunk_id, score in zip(ids, scores)
            if score >= min_score
        ]

        if not filtered:
            return []

        top_ids = [f[0] for f in filtered]
        top_scores = [f[1] for f in filtered]

        # Fetch content from SQLite
        return self._fetch_results_by_ids(top_ids, top_scores, return_full_content)
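
    # Worked example (added for clarity): in the cosine space used here, a
    # returned distance of 0.15 corresponds to a similarity score of
    # 1.0 - 0.15 = 0.85, so min_score=0.85 keeps results with distance <= 0.15.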

    def _search_brute_force(
        self,
        query_vec: np.ndarray,
        top_k: int,
        min_score: float,
        return_full_content: bool,
    ) -> List[SearchResult]:
        """Brute-force search using NumPy (O(N) fallback).

        Args:
            query_vec: Query vector as numpy array
            top_k: Maximum results to return
            min_score: Minimum similarity score (0-1)
            return_full_content: If True, return full code block content

        Returns:
            List of SearchResult ordered by similarity (highest first)
        """
        logger.warning(
            "Using brute-force vector search (hnswlib not available). "
            "This may cause high memory usage for large indexes. "
            "Install hnswlib for better performance: pip install hnswlib"
        )

        with self._cache_lock:
            # Refresh cache if needed
            if self._embedding_matrix is None:
                if not self._refresh_cache():
                    return []  # No data

            # Vectorized cosine similarity
            query_vec = query_vec.reshape(1, -1)
            query_norm = np.linalg.norm(query_vec)
            if query_norm == 0:
                return []

            # Compute all similarities at once: (N,) scores
            # similarity = (A @ B.T) / (||A|| * ||B||)
            dot_products = np.dot(self._embedding_matrix, query_vec.T).flatten()
            scores = dot_products / (self._embedding_norms.flatten() * query_norm)

            # Filter by min_score and get top-k indices
            valid_mask = scores >= min_score
            valid_indices = np.where(valid_mask)[0]

            if len(valid_indices) == 0:
                return []

            # Sort by score descending and take top_k
            valid_scores = scores[valid_indices]
            sorted_order = np.argsort(valid_scores)[::-1][:top_k]
            top_indices = valid_indices[sorted_order]
            top_scores = valid_scores[sorted_order]

            # Get chunk IDs for top results
            top_ids = [self._chunk_ids[i] for i in top_indices]

            # Fetch content only for top-k results (lazy loading)
            results = self._fetch_results_by_ids(
                top_ids, top_scores.tolist(), return_full_content
            )

            return results
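
    # Shape walkthrough (added for clarity): with a cached (N, D) matrix and a
    # (1, D) query, the dot product yields an (N, 1) column that is flattened to
    # (N,) scores; dividing by the per-row norms times the query norm gives
    # cosine similarities for all N chunks in a single vectorized pass.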

    def _fetch_results_by_ids(
        self,
        chunk_ids: List[int],
        scores: List[float],
        return_full_content: bool,
    ) -> List[SearchResult]:
        """Fetch full result data for specific chunk IDs.

        Args:
            chunk_ids: List of chunk IDs to fetch.
            scores: Corresponding similarity scores.
            return_full_content: Whether to include full content.

        Returns:
            List of SearchResult objects.
        """
        if not chunk_ids:
            return []

        # Build parameterized query for IN clause
        placeholders = ",".join("?" * len(chunk_ids))
        query = f"""
            SELECT id, file_path, content, metadata
            FROM semantic_chunks
            WHERE id IN ({placeholders})
        """

        with sqlite3.connect(self.db_path) as conn:
            conn.execute("PRAGMA mmap_size = 30000000000")
            rows = conn.execute(query, chunk_ids).fetchall()

        # Build ID -> row mapping
        id_to_row = {r[0]: r for r in rows}

        results = []
        for chunk_id, score in zip(chunk_ids, scores):
            row = id_to_row.get(chunk_id)
            if not row:
                continue

            _, file_path, content, metadata_json = row
            metadata = json.loads(metadata_json) if metadata_json else {}

            # Build excerpt (short preview)
            excerpt = content[:200] + "..." if len(content) > 200 else content

            # Extract symbol information from metadata
            symbol_name = metadata.get("symbol_name")
            symbol_kind = metadata.get("symbol_kind")
            start_line = metadata.get("start_line")
            end_line = metadata.get("end_line")

            # Build Symbol object if we have symbol info
            symbol = None
            if symbol_name and symbol_kind and start_line and end_line:
                try:
                    from codexlens.entities import Symbol
                    symbol = Symbol(
                        name=symbol_name,
                        kind=symbol_kind,
                        range=(start_line, end_line)
                    )
                except Exception:
                    pass

            results.append(SearchResult(
                path=file_path,
                score=score,
                excerpt=excerpt,
                content=content if return_full_content else None,
                symbol=symbol,
                metadata=metadata,
                start_line=start_line,
                end_line=end_line,
                symbol_name=symbol_name,
                symbol_kind=symbol_kind,
            ))

        return results

    def count_chunks(self) -> int:
        """Count total chunks in store."""
        with sqlite3.connect(self.db_path) as conn:
            row = conn.execute("SELECT COUNT(*) FROM semantic_chunks").fetchone()
            return row[0] if row else 0

    def clear_cache(self) -> None:
        """Manually clear the embedding cache."""
        self._invalidate_cache()

    @property
    def ann_available(self) -> bool:
        """Check if ANN index is available and ready."""
        return (
            HNSWLIB_AVAILABLE
            and self._ann_index is not None
            and self._ann_index.is_loaded
        )

    @property
    def ann_count(self) -> int:
        """Get number of vectors in ANN index."""
        if self._ann_index is not None:
            return self._ann_index.count()
        return 0

    def close(self) -> None:
        """Close the vector store and release resources.

        This ensures SQLite connections are closed and ANN index is cleared,
        allowing temporary files to be deleted on Windows.
        """
        with self._cache_lock:
            self._embedding_matrix = None
            self._embedding_norms = None
            self._chunk_ids = None

        with self._ann_write_lock:
            self._ann_index = None

    def __enter__(self) -> "VectorStore":
        """Context manager entry."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        """Context manager exit - close resources."""
        self.close()
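

# End-to-end usage sketch (illustrative only; the database path, import path,
# and query values are assumptions, not taken from this module). Using the
# context manager releases the cached matrices and the ANN index when done:
#
#   from codexlens.semantic.vector_store import VectorStore  # import path assumed
#
#   with VectorStore(".codexlens/vectors.db") as store:
#       print("chunks indexed:", store.count_chunks())
#       query_vec = [0.0] * (store.dimension or VectorStore.DEFAULT_DIM)
#       for r in store.search_similar(query_vec, top_k=3):
#           print(r.path, r.score)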