Mirror of https://github.com/catlog22/Claude-Code-Workflow.git, synced 2026-02-11 02:33:51 +08:00
fix(vector_store): add bounds checking for chunk ID generation
Prevents potential integer overflow when start_id is near sys.maxsize. Adds
validation before the range() calculation in batch insert methods.

Fixes: ISS-1766921318981-6
Solution-ID: SOL-1735386000-6
Issue-ID: ISS-1766921318981-6
Task-ID: T1
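For context, a minimal sketch of the failure mode the guard below targets. The names mirror those introduced in the diff; the concrete values are illustrative, not taken from a real index:

    import sys

    SQLITE_INTEGER_MAX = (1 << 63) - 1  # SQLite's signed 64-bit rowid ceiling

    start_id = sys.maxsize - 5      # MAX(id) in the table is already near the limit
    count = 10                      # a batch insert of 10 chunks
    last_id = start_id + count - 1  # Python ints never overflow, but rowids do

    # Without the guard, range(start_id, start_id + count) would hand SQLite ids
    # past its signed 64-bit rowid space; the patch raises ValueError instead.
    print(last_id > sys.maxsize or last_id > SQLITE_INTEGER_MAX)  # True -> guard fires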
@@ -11,6 +11,7 @@ from __future__ import annotations
 import json
 import logging
+import sys
 import sqlite3
 import threading
 from pathlib import Path
@@ -39,6 +40,24 @@ logger = logging.getLogger(__name__)
 # Epsilon used to guard against floating point precision edge cases (e.g., near-zero norms).
 EPSILON = 1e-10
 
+# SQLite INTEGER PRIMARY KEY uses signed 64-bit rowids.
+SQLITE_INTEGER_MAX = (1 << 63) - 1
+
+
+def _validate_chunk_id_range(start_id: int, count: int) -> None:
+    """Validate that a batch insert can safely generate sequential chunk IDs."""
+    if count <= 0:
+        return
+
+    last_id = start_id + count - 1
+    if last_id > sys.maxsize or last_id > SQLITE_INTEGER_MAX:
+        raise ValueError(
+            "Chunk ID range overflow: "
+            f"start_id={start_id}, count={count} would allocate up to {last_id}, "
+            f"exceeding limits (sys.maxsize={sys.maxsize}, sqlite_max={SQLITE_INTEGER_MAX}). "
+            "Consider cleaning up the index database or creating a new index database."
+        )
+
 
 def _cosine_similarity(a: List[float], b: List[float]) -> float:
     """Compute cosine similarity between two vectors."""
@@ -465,6 +484,8 @@ class VectorStore:
         if not chunks_with_paths:
             return []
 
+        batch_size = len(chunks_with_paths)
+
         # Prepare batch data
         batch_data = []
         embeddings_list = []
@@ -487,6 +508,8 @@ class VectorStore:
             row = conn.execute("SELECT MAX(id) FROM semantic_chunks").fetchone()
             start_id = (row[0] or 0) + 1
 
+            _validate_chunk_id_range(start_id, batch_size)
+
             conn.executemany(
                 """
                 INSERT INTO semantic_chunks (file_path, content, embedding, metadata)
@@ -496,7 +519,7 @@ class VectorStore:
             )
             conn.commit()
             # Calculate inserted IDs based on starting ID
-            ids = list(range(start_id, start_id + len(chunks_with_paths)))
+            ids = list(range(start_id, start_id + batch_size))
 
             # Handle ANN index updates
             if embeddings_list and update_ann and self._ensure_ann_index(len(embeddings_list[0])):
@@ -543,6 +566,8 @@ class VectorStore:
         if not chunks_with_paths:
             return []
 
+        batch_size = len(chunks_with_paths)
+
         if len(chunks_with_paths) != embeddings_matrix.shape[0]:
             raise ValueError(
                 f"Mismatch: {len(chunks_with_paths)} chunks but "
@@ -566,6 +591,8 @@ class VectorStore:
             row = conn.execute("SELECT MAX(id) FROM semantic_chunks").fetchone()
             start_id = (row[0] or 0) + 1
 
+            _validate_chunk_id_range(start_id, batch_size)
+
             conn.executemany(
                 """
                 INSERT INTO semantic_chunks (file_path, content, embedding, metadata)
@@ -575,7 +602,7 @@ class VectorStore:
             )
             conn.commit()
             # Calculate inserted IDs based on starting ID
-            ids = list(range(start_id, start_id + len(chunks_with_paths)))
+            ids = list(range(start_id, start_id + batch_size))
 
             # Handle ANN index updates
             if update_ann and self._ensure_ann_index(embeddings_matrix.shape[1]):
@@ -1,3 +1,5 @@
+import sqlite3
+import sys
 import tempfile
 import threading
 import time
@@ -251,3 +253,67 @@ def test_search_with_ann_valid_results(monkeypatch: pytest.MonkeyPatch, temp_db:
     results = store._search_with_ann(np.array([1.0, 0.0, 0.0], dtype=np.float32), top_k=10, min_score=0.0, return_full_content=False)
     assert [r.path for r in results] == ["a.py"]
     assert results[0].score == pytest.approx(1.0)
+
+
+def test_add_chunks_batch_overflow(monkeypatch: pytest.MonkeyPatch, temp_db: Path) -> None:
+    """add_chunks_batch should fail fast when generated IDs would exceed SQLite/sys bounds."""
+    monkeypatch.setattr(vector_store_module, "HNSWLIB_AVAILABLE", False)
+    store = VectorStore(temp_db)
+
+    seed_embedding = np.array([1.0, 0.0, 0.0], dtype=np.float32).tobytes()
+    with sqlite3.connect(store.db_path) as conn:
+        conn.execute(
+            "INSERT INTO semantic_chunks (id, file_path, content, embedding, metadata) VALUES (?, ?, ?, ?, ?)",
+            (sys.maxsize - 5, "seed.py", "seed", seed_embedding, None),
+        )
+        conn.commit()
+
+    chunks_with_paths: list[tuple[SemanticChunk, str]] = []
+    for i in range(10):
+        chunks_with_paths.append(
+            (
+                SemanticChunk(content=f"chunk {i}", metadata={}, embedding=[1.0, 0.0, 0.0]),
+                f"file_{i}.py",
+            )
+        )
+
+    with pytest.raises(ValueError, match=r"Chunk ID range overflow"):
+        store.add_chunks_batch(chunks_with_paths)
+
+
+def test_add_chunks_batch_generates_sequential_ids(monkeypatch: pytest.MonkeyPatch, temp_db: Path) -> None:
+    """add_chunks_batch should return sequential IDs for a fresh store."""
+    monkeypatch.setattr(vector_store_module, "HNSWLIB_AVAILABLE", False)
+    store = VectorStore(temp_db)
+
+    chunks_with_paths = [
+        (SemanticChunk(content="chunk A", metadata={}, embedding=[1.0, 0.0, 0.0]), "a.py"),
+        (SemanticChunk(content="chunk B", metadata={}, embedding=[0.0, 1.0, 0.0]), "b.py"),
+    ]
+
+    ids = store.add_chunks_batch(chunks_with_paths, update_ann=False)
+    assert ids == [1, 2]
+    assert store.count_chunks() == 2
+
+
+def test_add_chunks_batch_numpy_overflow(monkeypatch: pytest.MonkeyPatch, temp_db: Path) -> None:
+    """add_chunks_batch_numpy should fail fast when generated IDs would exceed SQLite/sys bounds."""
+    monkeypatch.setattr(vector_store_module, "HNSWLIB_AVAILABLE", False)
+    store = VectorStore(temp_db)
+
+    seed_embedding = np.array([1.0, 0.0, 0.0], dtype=np.float32).tobytes()
+    with sqlite3.connect(store.db_path) as conn:
+        conn.execute(
+            "INSERT INTO semantic_chunks (id, file_path, content, embedding, metadata) VALUES (?, ?, ?, ?, ?)",
+            (sys.maxsize - 5, "seed.py", "seed", seed_embedding, None),
+        )
+        conn.commit()
+
+    chunks_with_paths = [
+        (SemanticChunk(content=f"chunk {i}", metadata={}), f"file_{i}.py")
+        for i in range(10)
+    ]
+    embeddings = np.random.randn(10, 3).astype(np.float32)
+
+    with pytest.raises(ValueError, match=r"Chunk ID range overflow"):
+        store.add_chunks_batch_numpy(chunks_with_paths, embeddings)