Claude-Code-Workflow/codex-lens/src/codexlens/semantic/vector_store.py
catlog22 3e9a309079 refactor: remove graph indexing, fix memory leaks, optimize embedding generation
Main changes:

1. Remove graph indexing
   - Delete graph_analyzer.py and its related migration files
   - Remove the CLI graph command and the --enrich flag
   - Remove the graph query methods from chain_search.py (370 lines)
   - Delete the related test files

2. Fix memory usage in embedding generation
   - Refactor generate_embeddings.py to use streaming batch processing
   - Switch to embedding_manager's memory-safe implementation
   - File trimmed from 548 to 259 lines (52.7% reduction)

3. Fix memory leaks (see the sketch below this commit message)
   - chain_search.py: quick_search manages ChainSearchEngine with a `with` statement
   - embedding_manager.py: manage VectorStore with a `with` statement
   - vector_store.py: add a memory warning for brute-force search

4. Code cleanup
   - Remove the token_count and symbol_type fields from the Symbol model
   - Clean up the related test cases

Tests: 760 passed, 7 skipped

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-21 16:22:03 +08:00
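
The memory fixes in items 2 and 3 come down to two patterns that the vector_store.py source below supports directly: write embeddings in bounded batches rather than accumulating them all, and scope each VectorStore to a `with` block so close() releases the cached embedding matrix and the ANN index. A minimal sketch of that usage, assuming a hypothetical batch source (the store API itself is taken from the file below):

    from codexlens.semantic.vector_store import VectorStore

    def index_batches(db_path, chunk_batches):
        """chunk_batches yields small lists of (SemanticChunk, file_path) tuples."""
        total = 0
        # Only one batch of embeddings is held in memory at a time.
        with VectorStore(db_path) as store:  # close() runs on exit
            for batch in chunk_batches:
                total += len(store.add_chunks_batch(batch))
        return total

    # Queries follow the same pattern:
    #     with VectorStore(db_path) as store:
    #         results = store.search_similar(query_embedding, top_k=10, min_score=0.3)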

778 lines
26 KiB
Python

"""Vector storage and similarity search for semantic chunks.
Optimized for high-performance similarity search using:
- HNSW index for O(log N) approximate nearest neighbor search (primary)
- Cached embedding matrix for batch operations (fallback)
- NumPy vectorized cosine similarity (fallback, 100x+ faster than loops)
- Lazy content loading (only fetch for top-k results)
"""
from __future__ import annotations
import json
import logging
import sqlite3
import threading
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
from codexlens.entities import SearchResult, SemanticChunk
from codexlens.errors import StorageError
from . import SEMANTIC_AVAILABLE
if SEMANTIC_AVAILABLE:
    import numpy as np

# Try to import ANN index (optional hnswlib dependency)
try:
    from codexlens.semantic.ann_index import ANNIndex, HNSWLIB_AVAILABLE
except ImportError:
    HNSWLIB_AVAILABLE = False
    ANNIndex = None

logger = logging.getLogger(__name__)

def _cosine_similarity(a: List[float], b: List[float]) -> float:
    """Compute cosine similarity between two vectors."""
    if not SEMANTIC_AVAILABLE:
        raise ImportError("numpy required for vector operations")
    a_arr = np.array(a)
    b_arr = np.array(b)
    norm_a = np.linalg.norm(a_arr)
    norm_b = np.linalg.norm(b_arr)
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return float(np.dot(a_arr, b_arr) / (norm_a * norm_b))

class VectorStore:
    """SQLite-based vector storage with HNSW-accelerated similarity search.

    Performance optimizations:
    - HNSW index for O(log N) approximate nearest neighbor search
    - Embedding matrix cached in memory for batch similarity computation (fallback)
    - NumPy vectorized operations instead of Python loops (fallback)
    - Lazy content loading - only fetch full content for top-k results
    - Thread-safe cache invalidation
    """

    # Default embedding dimension (used when creating new index)
    DEFAULT_DIM = 768

    def __init__(self, db_path: str | Path) -> None:
        if not SEMANTIC_AVAILABLE:
            raise ImportError(
                "Semantic search dependencies not available. "
                "Install with: pip install codexlens[semantic]"
            )
        self.db_path = Path(db_path)
        self.db_path.parent.mkdir(parents=True, exist_ok=True)

        # Embedding cache for fast similarity search (fallback)
        self._cache_lock = threading.RLock()
        self._embedding_matrix: Optional[np.ndarray] = None
        self._embedding_norms: Optional[np.ndarray] = None
        self._chunk_ids: Optional[List[int]] = None
        self._cache_version: int = 0

        # ANN index for O(log N) search
        self._ann_index: Optional[ANNIndex] = None
        self._ann_dim: Optional[int] = None
        self._ann_write_lock = threading.Lock()  # Protects ANN index modifications

        self._init_schema()
        self._init_ann_index()

    def _init_schema(self) -> None:
        """Initialize vector storage schema."""
        with sqlite3.connect(self.db_path) as conn:
            # Enable memory mapping for faster reads
            conn.execute("PRAGMA mmap_size = 30000000000")  # 30GB limit
            conn.execute("""
                CREATE TABLE IF NOT EXISTS semantic_chunks (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    file_path TEXT NOT NULL,
                    content TEXT NOT NULL,
                    embedding BLOB NOT NULL,
                    metadata TEXT,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            """)
            conn.execute("""
                CREATE INDEX IF NOT EXISTS idx_chunks_file
                ON semantic_chunks(file_path)
            """)
            conn.commit()

    def _init_ann_index(self) -> None:
        """Initialize ANN index (lazy loading from existing data)."""
        if not HNSWLIB_AVAILABLE:
            logger.debug("hnswlib not available, using brute-force search")
            return

        # Try to detect embedding dimension from existing data
        dim = self._detect_embedding_dim()
        if dim is None:
            # No data yet, will initialize on first add
            logger.debug("No embeddings found, ANN index will be created on first add")
            return

        self._ann_dim = dim
        try:
            self._ann_index = ANNIndex(self.db_path, dim)
            if self._ann_index.load():
                logger.debug(
                    "Loaded ANN index with %d vectors", self._ann_index.count()
                )
            else:
                # Index file doesn't exist, try to build from SQLite data
                logger.debug("ANN index file not found, rebuilding from SQLite")
                self._rebuild_ann_index_internal()
        except Exception as e:
            logger.warning("Failed to initialize ANN index: %s", e)
            self._ann_index = None

    def _detect_embedding_dim(self) -> Optional[int]:
        """Detect embedding dimension from existing data."""
        with sqlite3.connect(self.db_path) as conn:
            row = conn.execute(
                "SELECT embedding FROM semantic_chunks LIMIT 1"
            ).fetchone()
            if row and row[0]:
                # Embeddings are stored as float32 blobs, so the dimension is the
                # byte length divided by 4 (a 768-dim vector occupies 3072 bytes).
                blob = row[0]
                return len(blob) // np.dtype(np.float32).itemsize
            return None

    @property
    def dimension(self) -> Optional[int]:
        """Return the dimension of embeddings in the store.

        Returns:
            Embedding dimension if available, None if store is empty.
        """
        if self._ann_dim is not None:
            return self._ann_dim
        self._ann_dim = self._detect_embedding_dim()
        return self._ann_dim

    def _rebuild_ann_index_internal(self) -> int:
        """Internal method to rebuild ANN index from SQLite data."""
        if self._ann_index is None:
            return 0

        with sqlite3.connect(self.db_path) as conn:
            conn.execute("PRAGMA mmap_size = 30000000000")
            rows = conn.execute(
                "SELECT id, embedding FROM semantic_chunks"
            ).fetchall()
        if not rows:
            return 0

        # Extract IDs and embeddings
        ids = [r[0] for r in rows]
        embeddings = np.vstack([
            np.frombuffer(r[1], dtype=np.float32) for r in rows
        ])

        # Add to ANN index
        self._ann_index.add_vectors(ids, embeddings)
        self._ann_index.save()
        logger.info("Rebuilt ANN index with %d vectors", len(ids))
        return len(ids)

    def rebuild_ann_index(self) -> int:
        """Rebuild HNSW index from all chunks in SQLite.

        Use this method to:
        - Migrate existing data to use ANN search
        - Repair corrupted index
        - Reclaim space after many deletions

        Returns:
            Number of vectors indexed.
        """
        if not HNSWLIB_AVAILABLE:
            logger.warning("hnswlib not available, cannot rebuild ANN index")
            return 0

        # Detect dimension
        dim = self._detect_embedding_dim()
        if dim is None:
            logger.warning("No embeddings found, cannot rebuild ANN index")
            return 0
        self._ann_dim = dim

        # Create new index
        try:
            self._ann_index = ANNIndex(self.db_path, dim)
            return self._rebuild_ann_index_internal()
        except Exception as e:
            logger.error("Failed to rebuild ANN index: %s", e)
            self._ann_index = None
            return 0

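
    # Illustrative usage sketch (the path is hypothetical, not from the repository):
    # a one-off rebuild migrates a store that was populated before hnswlib was
    # installed, or repairs a corrupted index file.
    #
    #     with VectorStore("index/semantic_vectors.db") as store:
    #         indexed = store.rebuild_ann_index()
    #         logger.info("ANN index rebuilt with %d vectors", indexed)
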
    def _invalidate_cache(self) -> None:
        """Invalidate the embedding cache (thread-safe)."""
        with self._cache_lock:
            self._embedding_matrix = None
            self._embedding_norms = None
            self._chunk_ids = None
            self._cache_version += 1

    def _refresh_cache(self) -> bool:
        """Load embeddings into numpy matrix for fast similarity search.

        Returns:
            True if cache was refreshed successfully, False if no data.
        """
        with self._cache_lock:
            with sqlite3.connect(self.db_path) as conn:
                conn.execute("PRAGMA mmap_size = 30000000000")
                rows = conn.execute(
                    "SELECT id, embedding FROM semantic_chunks"
                ).fetchall()

            if not rows:
                self._embedding_matrix = None
                self._embedding_norms = None
                self._chunk_ids = None
                return False

            # Extract IDs and embeddings
            self._chunk_ids = [r[0] for r in rows]

            # Bulk convert binary blobs to numpy matrix
            embeddings = [
                np.frombuffer(r[1], dtype=np.float32) for r in rows
            ]
            self._embedding_matrix = np.vstack(embeddings)

            # Pre-compute norms for faster similarity calculation
            self._embedding_norms = np.linalg.norm(
                self._embedding_matrix, axis=1, keepdims=True
            )
            # Avoid division by zero
            self._embedding_norms = np.where(
                self._embedding_norms == 0, 1e-10, self._embedding_norms
            )
            return True

    def _ensure_ann_index(self, dim: int) -> bool:
        """Ensure ANN index is initialized with correct dimension.

        This method is thread-safe and uses double-checked locking.

        Args:
            dim: Embedding dimension

        Returns:
            True if ANN index is ready, False otherwise
        """
        if not HNSWLIB_AVAILABLE:
            return False

        # Fast path: index already initialized (no lock needed)
        if self._ann_index is not None:
            return True

        # Slow path: acquire lock for initialization
        with self._ann_write_lock:
            # Double-check after acquiring lock
            if self._ann_index is not None:
                return True
            try:
                self._ann_dim = dim
                self._ann_index = ANNIndex(self.db_path, dim)
                self._ann_index.load()  # Try to load existing
                return True
            except Exception as e:
                logger.warning("Failed to initialize ANN index: %s", e)
                self._ann_index = None
                return False

    def add_chunk(self, chunk: SemanticChunk, file_path: str) -> int:
        """Add a single chunk with its embedding.

        Returns:
            The inserted chunk ID.
        """
        if chunk.embedding is None:
            raise ValueError("Chunk must have embedding before adding to store")

        embedding_arr = np.array(chunk.embedding, dtype=np.float32)
        embedding_blob = embedding_arr.tobytes()
        metadata_json = json.dumps(chunk.metadata) if chunk.metadata else None

        with sqlite3.connect(self.db_path) as conn:
            cursor = conn.execute(
                """
                INSERT INTO semantic_chunks (file_path, content, embedding, metadata)
                VALUES (?, ?, ?, ?)
                """,
                (file_path, chunk.content, embedding_blob, metadata_json)
            )
            conn.commit()
            chunk_id = cursor.lastrowid or 0

        # Add to ANN index
        if self._ensure_ann_index(len(chunk.embedding)):
            with self._ann_write_lock:
                try:
                    self._ann_index.add_vectors([chunk_id], embedding_arr.reshape(1, -1))
                    self._ann_index.save()
                except Exception as e:
                    logger.warning("Failed to add to ANN index: %s", e)

        # Invalidate cache after modification
        self._invalidate_cache()
        return chunk_id

    def add_chunks(self, chunks: List[SemanticChunk], file_path: str) -> List[int]:
        """Add multiple chunks with embeddings (batch insert).

        Returns:
            List of inserted chunk IDs.
        """
        if not chunks:
            return []

        # Prepare batch data
        batch_data = []
        embeddings_list = []
        for chunk in chunks:
            if chunk.embedding is None:
                raise ValueError("All chunks must have embeddings")
            embedding_arr = np.array(chunk.embedding, dtype=np.float32)
            embedding_blob = embedding_arr.tobytes()
            metadata_json = json.dumps(chunk.metadata) if chunk.metadata else None
            batch_data.append((file_path, chunk.content, embedding_blob, metadata_json))
            embeddings_list.append(embedding_arr)

        # Batch insert to SQLite
        with sqlite3.connect(self.db_path) as conn:
            # Get starting ID before insert
            row = conn.execute("SELECT MAX(id) FROM semantic_chunks").fetchone()
            start_id = (row[0] or 0) + 1
            conn.executemany(
                """
                INSERT INTO semantic_chunks (file_path, content, embedding, metadata)
                VALUES (?, ?, ?, ?)
                """,
                batch_data
            )
            conn.commit()

        # Calculate inserted IDs based on starting ID. Note: AUTOINCREMENT never
        # reuses ids, so after deletions the actual ids can start above
        # MAX(id) + 1; rebuild_ann_index() restores a consistent ANN mapping.
        ids = list(range(start_id, start_id + len(chunks)))

        # Add to ANN index
        if embeddings_list and self._ensure_ann_index(len(embeddings_list[0])):
            with self._ann_write_lock:
                try:
                    embeddings_matrix = np.vstack(embeddings_list)
                    self._ann_index.add_vectors(ids, embeddings_matrix)
                    self._ann_index.save()
                except Exception as e:
                    logger.warning("Failed to add batch to ANN index: %s", e)

        # Invalidate cache after modification
        self._invalidate_cache()
        return ids

    def add_chunks_batch(
        self, chunks_with_paths: List[Tuple[SemanticChunk, str]]
    ) -> List[int]:
        """Batch insert chunks from multiple files in a single transaction.

        This method is optimized for bulk operations during index generation.

        Args:
            chunks_with_paths: List of (chunk, file_path) tuples

        Returns:
            List of inserted chunk IDs
        """
        if not chunks_with_paths:
            return []

        # Prepare batch data
        batch_data = []
        embeddings_list = []
        for chunk, file_path in chunks_with_paths:
            if chunk.embedding is None:
                raise ValueError("All chunks must have embeddings")
            embedding_arr = np.array(chunk.embedding, dtype=np.float32)
            embedding_blob = embedding_arr.tobytes()
            metadata_json = json.dumps(chunk.metadata) if chunk.metadata else None
            batch_data.append((file_path, chunk.content, embedding_blob, metadata_json))
            embeddings_list.append(embedding_arr)

        # Batch insert to SQLite in single transaction
        with sqlite3.connect(self.db_path) as conn:
            # Get starting ID before insert
            row = conn.execute("SELECT MAX(id) FROM semantic_chunks").fetchone()
            start_id = (row[0] or 0) + 1
            conn.executemany(
                """
                INSERT INTO semantic_chunks (file_path, content, embedding, metadata)
                VALUES (?, ?, ?, ?)
                """,
                batch_data
            )
            conn.commit()

        # Calculate inserted IDs based on starting ID
        ids = list(range(start_id, start_id + len(chunks_with_paths)))

        # Add to ANN index
        if embeddings_list and self._ensure_ann_index(len(embeddings_list[0])):
            with self._ann_write_lock:
                try:
                    embeddings_matrix = np.vstack(embeddings_list)
                    self._ann_index.add_vectors(ids, embeddings_matrix)
                    self._ann_index.save()
                except Exception as e:
                    logger.warning("Failed to add batch to ANN index: %s", e)

        # Invalidate cache after modification
        self._invalidate_cache()
        return ids

    def delete_file_chunks(self, file_path: str) -> int:
        """Delete all chunks for a file.

        Returns:
            Number of deleted chunks.
        """
        # Get chunk IDs before deletion (for ANN index)
        chunk_ids_to_delete = []
        if self._ann_index is not None:
            with sqlite3.connect(self.db_path) as conn:
                rows = conn.execute(
                    "SELECT id FROM semantic_chunks WHERE file_path = ?",
                    (file_path,)
                ).fetchall()
                chunk_ids_to_delete = [r[0] for r in rows]

        # Delete from SQLite
        with sqlite3.connect(self.db_path) as conn:
            cursor = conn.execute(
                "DELETE FROM semantic_chunks WHERE file_path = ?",
                (file_path,)
            )
            conn.commit()
            deleted = cursor.rowcount

        # Remove from ANN index
        if deleted > 0 and self._ann_index is not None and chunk_ids_to_delete:
            with self._ann_write_lock:
                try:
                    self._ann_index.remove_vectors(chunk_ids_to_delete)
                    self._ann_index.save()
                except Exception as e:
                    logger.warning("Failed to remove from ANN index: %s", e)

        if deleted > 0:
            self._invalidate_cache()
        return deleted

    def search_similar(
        self,
        query_embedding: List[float],
        top_k: int = 10,
        min_score: float = 0.0,
        return_full_content: bool = True,
    ) -> List[SearchResult]:
        """Find chunks most similar to query embedding.

        Uses HNSW index for O(log N) search when available, falls back to
        brute-force NumPy search otherwise.

        Args:
            query_embedding: Query vector.
            top_k: Maximum results to return.
            min_score: Minimum similarity score (0-1).
            return_full_content: If True, return full code block content.

        Returns:
            List of SearchResult ordered by similarity (highest first).
        """
        query_vec = np.array(query_embedding, dtype=np.float32)

        # Try HNSW search first (O(log N))
        if (
            HNSWLIB_AVAILABLE
            and self._ann_index is not None
            and self._ann_index.is_loaded
            and self._ann_index.count() > 0
        ):
            try:
                return self._search_with_ann(
                    query_vec, top_k, min_score, return_full_content
                )
            except Exception as e:
                logger.warning("ANN search failed, falling back to brute-force: %s", e)

        # Fallback to brute-force search (O(N))
        return self._search_brute_force(
            query_vec, top_k, min_score, return_full_content
        )

    def _search_with_ann(
        self,
        query_vec: np.ndarray,
        top_k: int,
        min_score: float,
        return_full_content: bool,
    ) -> List[SearchResult]:
        """Search using HNSW index (O(log N)).

        Args:
            query_vec: Query vector as numpy array
            top_k: Maximum results to return
            min_score: Minimum similarity score (0-1)
            return_full_content: If True, return full code block content

        Returns:
            List of SearchResult ordered by similarity (highest first)
        """
        # Limit top_k to available vectors to prevent hnswlib error
        ann_count = self._ann_index.count()
        effective_top_k = min(top_k, ann_count) if ann_count > 0 else 0
        if effective_top_k == 0:
            return []

        # HNSW search returns (ids, distances)
        # For cosine space: distance = 1 - similarity
        ids, distances = self._ann_index.search(query_vec, effective_top_k)
        if not ids:
            return []

        # Convert distances to similarity scores
        scores = [1.0 - d for d in distances]

        # Filter by min_score
        filtered = [
            (chunk_id, score)
            for chunk_id, score in zip(ids, scores)
            if score >= min_score
        ]
        if not filtered:
            return []

        top_ids = [f[0] for f in filtered]
        top_scores = [f[1] for f in filtered]

        # Fetch content from SQLite
        return self._fetch_results_by_ids(top_ids, top_scores, return_full_content)

    def _search_brute_force(
        self,
        query_vec: np.ndarray,
        top_k: int,
        min_score: float,
        return_full_content: bool,
    ) -> List[SearchResult]:
        """Brute-force search using NumPy (O(N) fallback).

        Args:
            query_vec: Query vector as numpy array
            top_k: Maximum results to return
            min_score: Minimum similarity score (0-1)
            return_full_content: If True, return full code block content

        Returns:
            List of SearchResult ordered by similarity (highest first)
        """
        logger.warning(
            "Using brute-force vector search (hnswlib not available or ANN search failed). "
            "This may cause high memory usage for large indexes. "
            "Install hnswlib for better performance: pip install hnswlib"
        )
        with self._cache_lock:
            # Refresh cache if needed
            if self._embedding_matrix is None:
                if not self._refresh_cache():
                    return []  # No data

            # Vectorized cosine similarity
            query_vec = query_vec.reshape(1, -1)
            query_norm = np.linalg.norm(query_vec)
            if query_norm == 0:
                return []

            # Compute all similarities at once: (N,) scores
            # similarity = (A @ B.T) / (||A|| * ||B||)
            dot_products = np.dot(self._embedding_matrix, query_vec.T).flatten()
            scores = dot_products / (self._embedding_norms.flatten() * query_norm)

            # Filter by min_score and get top-k indices
            valid_mask = scores >= min_score
            valid_indices = np.where(valid_mask)[0]
            if len(valid_indices) == 0:
                return []

            # Sort by score descending and take top_k
            valid_scores = scores[valid_indices]
            sorted_order = np.argsort(valid_scores)[::-1][:top_k]
            top_indices = valid_indices[sorted_order]
            top_scores = valid_scores[sorted_order]

            # Get chunk IDs for top results
            top_ids = [self._chunk_ids[i] for i in top_indices]

        # Fetch content only for top-k results (lazy loading)
        results = self._fetch_results_by_ids(
            top_ids, top_scores.tolist(), return_full_content
        )
        return results

    def _fetch_results_by_ids(
        self,
        chunk_ids: List[int],
        scores: List[float],
        return_full_content: bool,
    ) -> List[SearchResult]:
        """Fetch full result data for specific chunk IDs.

        Args:
            chunk_ids: List of chunk IDs to fetch.
            scores: Corresponding similarity scores.
            return_full_content: Whether to include full content.

        Returns:
            List of SearchResult objects.
        """
        if not chunk_ids:
            return []

        # Build parameterized query for IN clause
        placeholders = ",".join("?" * len(chunk_ids))
        query = f"""
            SELECT id, file_path, content, metadata
            FROM semantic_chunks
            WHERE id IN ({placeholders})
        """
        with sqlite3.connect(self.db_path) as conn:
            conn.execute("PRAGMA mmap_size = 30000000000")
            rows = conn.execute(query, chunk_ids).fetchall()

        # Build ID -> row mapping
        id_to_row = {r[0]: r for r in rows}

        results = []
        for chunk_id, score in zip(chunk_ids, scores):
            row = id_to_row.get(chunk_id)
            if not row:
                continue
            _, file_path, content, metadata_json = row
            metadata = json.loads(metadata_json) if metadata_json else {}

            # Build excerpt (short preview)
            excerpt = content[:200] + "..." if len(content) > 200 else content

            # Extract symbol information from metadata
            symbol_name = metadata.get("symbol_name")
            symbol_kind = metadata.get("symbol_kind")
            start_line = metadata.get("start_line")
            end_line = metadata.get("end_line")

            # Build Symbol object if we have symbol info
            symbol = None
            if symbol_name and symbol_kind and start_line and end_line:
                try:
                    from codexlens.entities import Symbol
                    symbol = Symbol(
                        name=symbol_name,
                        kind=symbol_kind,
                        range=(start_line, end_line)
                    )
                except Exception:
                    pass

            results.append(SearchResult(
                path=file_path,
                score=score,
                excerpt=excerpt,
                content=content if return_full_content else None,
                symbol=symbol,
                metadata=metadata,
                start_line=start_line,
                end_line=end_line,
                symbol_name=symbol_name,
                symbol_kind=symbol_kind,
            ))
        return results

    def count_chunks(self) -> int:
        """Count total chunks in store."""
        with sqlite3.connect(self.db_path) as conn:
            row = conn.execute("SELECT COUNT(*) FROM semantic_chunks").fetchone()
            return row[0] if row else 0

    def clear_cache(self) -> None:
        """Manually clear the embedding cache."""
        self._invalidate_cache()

    @property
    def ann_available(self) -> bool:
        """Check if ANN index is available and ready."""
        return (
            HNSWLIB_AVAILABLE
            and self._ann_index is not None
            and self._ann_index.is_loaded
        )

    @property
    def ann_count(self) -> int:
        """Get number of vectors in ANN index."""
        if self._ann_index is not None:
            return self._ann_index.count()
        return 0

    def close(self) -> None:
        """Close the vector store and release resources.

        This ensures SQLite connections are closed and ANN index is cleared,
        allowing temporary files to be deleted on Windows.
        """
        with self._cache_lock:
            self._embedding_matrix = None
            self._embedding_norms = None
            self._chunk_ids = None
        with self._ann_write_lock:
            self._ann_index = None

    def __enter__(self) -> "VectorStore":
        """Context manager entry."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        """Context manager exit - close resources."""
        self.close()