diff --git a/codex-lens/src/codexlens/semantic/vector_store.py b/codex-lens/src/codexlens/semantic/vector_store.py
index 0f6d58f0..4a158bfb 100644
--- a/codex-lens/src/codexlens/semantic/vector_store.py
+++ b/codex-lens/src/codexlens/semantic/vector_store.py
@@ -11,6 +11,7 @@ from __future__ import annotations
 
 import json
 import logging
+import sys
 import sqlite3
 import threading
 from pathlib import Path
@@ -39,6 +40,24 @@ logger = logging.getLogger(__name__)
 # Epsilon used to guard against floating point precision edge cases (e.g., near-zero norms).
 EPSILON = 1e-10
 
+# SQLite INTEGER PRIMARY KEY uses signed 64-bit rowids.
+SQLITE_INTEGER_MAX = (1 << 63) - 1
+
+
+def _validate_chunk_id_range(start_id: int, count: int) -> None:
+    """Validate that a batch insert can safely generate sequential chunk IDs."""
+    if count <= 0:
+        return
+
+    last_id = start_id + count - 1
+    if last_id > sys.maxsize or last_id > SQLITE_INTEGER_MAX:
+        raise ValueError(
+            "Chunk ID range overflow: "
+            f"start_id={start_id}, count={count} would allocate up to {last_id}, "
+            f"exceeding limits (sys.maxsize={sys.maxsize}, sqlite_max={SQLITE_INTEGER_MAX}). "
+            "Consider cleaning up the index database or creating a new index database."
+        )
+
 
 def _cosine_similarity(a: List[float], b: List[float]) -> float:
     """Compute cosine similarity between two vectors."""
@@ -465,6 +484,8 @@ class VectorStore:
         if not chunks_with_paths:
             return []
 
+        batch_size = len(chunks_with_paths)
+
         # Prepare batch data
         batch_data = []
         embeddings_list = []
@@ -487,6 +508,8 @@
                 row = conn.execute("SELECT MAX(id) FROM semantic_chunks").fetchone()
                 start_id = (row[0] or 0) + 1
 
+                _validate_chunk_id_range(start_id, batch_size)
+
                 conn.executemany(
                     """
                     INSERT INTO semantic_chunks (file_path, content, embedding, metadata)
@@ -496,7 +519,7 @@
                 )
                 conn.commit()
                 # Calculate inserted IDs based on starting ID
-                ids = list(range(start_id, start_id + len(chunks_with_paths)))
+                ids = list(range(start_id, start_id + batch_size))
 
         # Handle ANN index updates
         if embeddings_list and update_ann and self._ensure_ann_index(len(embeddings_list[0])):
@@ -543,6 +566,8 @@
         if not chunks_with_paths:
             return []
 
+        batch_size = len(chunks_with_paths)
+
         if len(chunks_with_paths) != embeddings_matrix.shape[0]:
             raise ValueError(
                 f"Mismatch: {len(chunks_with_paths)} chunks but "
@@ -566,6 +591,8 @@
                 row = conn.execute("SELECT MAX(id) FROM semantic_chunks").fetchone()
                 start_id = (row[0] or 0) + 1
 
+                _validate_chunk_id_range(start_id, batch_size)
+
                 conn.executemany(
                     """
                     INSERT INTO semantic_chunks (file_path, content, embedding, metadata)
@@ -575,7 +602,7 @@
                 )
                 conn.commit()
                 # Calculate inserted IDs based on starting ID
-                ids = list(range(start_id, start_id + len(chunks_with_paths)))
+                ids = list(range(start_id, start_id + batch_size))
 
         # Handle ANN index updates
         if update_ann and self._ensure_ann_index(embeddings_matrix.shape[1]):
diff --git a/codex-lens/tests/test_vector_store.py b/codex-lens/tests/test_vector_store.py
index 3fbe25a2..717d75a2 100644
--- a/codex-lens/tests/test_vector_store.py
+++ b/codex-lens/tests/test_vector_store.py
@@ -1,3 +1,5 @@
+import sqlite3
+import sys
 import tempfile
 import threading
 import time
@@ -251,3 +253,67 @@ def test_search_with_ann_valid_results(monkeypatch: pytest.MonkeyPatch, temp_db:
     results = store._search_with_ann(np.array([1.0, 0.0, 0.0], dtype=np.float32), top_k=10, min_score=0.0, return_full_content=False)
     assert [r.path for r in results] == ["a.py"]
     assert results[0].score == pytest.approx(1.0)
+
+
+def test_add_chunks_batch_overflow(monkeypatch: pytest.MonkeyPatch, temp_db: Path) -> None:
+    """add_chunks_batch should fail fast when generated IDs would exceed SQLite/sys bounds."""
+    monkeypatch.setattr(vector_store_module, "HNSWLIB_AVAILABLE", False)
+    store = VectorStore(temp_db)
+
+    seed_embedding = np.array([1.0, 0.0, 0.0], dtype=np.float32).tobytes()
+    with sqlite3.connect(store.db_path) as conn:
+        conn.execute(
+            "INSERT INTO semantic_chunks (id, file_path, content, embedding, metadata) VALUES (?, ?, ?, ?, ?)",
+            (sys.maxsize - 5, "seed.py", "seed", seed_embedding, None),
+        )
+        conn.commit()
+
+    chunks_with_paths: list[tuple[SemanticChunk, str]] = []
+    for i in range(10):
+        chunks_with_paths.append(
+            (
+                SemanticChunk(content=f"chunk {i}", metadata={}, embedding=[1.0, 0.0, 0.0]),
+                f"file_{i}.py",
+            )
+        )
+
+    with pytest.raises(ValueError, match=r"Chunk ID range overflow"):
+        store.add_chunks_batch(chunks_with_paths)
+
+
+def test_add_chunks_batch_generates_sequential_ids(monkeypatch: pytest.MonkeyPatch, temp_db: Path) -> None:
+    """add_chunks_batch should return sequential IDs for a fresh store."""
+    monkeypatch.setattr(vector_store_module, "HNSWLIB_AVAILABLE", False)
+    store = VectorStore(temp_db)
+
+    chunks_with_paths = [
+        (SemanticChunk(content="chunk A", metadata={}, embedding=[1.0, 0.0, 0.0]), "a.py"),
+        (SemanticChunk(content="chunk B", metadata={}, embedding=[0.0, 1.0, 0.0]), "b.py"),
+    ]
+
+    ids = store.add_chunks_batch(chunks_with_paths, update_ann=False)
+    assert ids == [1, 2]
+    assert store.count_chunks() == 2
+
+
+def test_add_chunks_batch_numpy_overflow(monkeypatch: pytest.MonkeyPatch, temp_db: Path) -> None:
+    """add_chunks_batch_numpy should fail fast when generated IDs would exceed SQLite/sys bounds."""
+    monkeypatch.setattr(vector_store_module, "HNSWLIB_AVAILABLE", False)
+    store = VectorStore(temp_db)
+
+    seed_embedding = np.array([1.0, 0.0, 0.0], dtype=np.float32).tobytes()
+    with sqlite3.connect(store.db_path) as conn:
+        conn.execute(
+            "INSERT INTO semantic_chunks (id, file_path, content, embedding, metadata) VALUES (?, ?, ?, ?, ?)",
+            (sys.maxsize - 5, "seed.py", "seed", seed_embedding, None),
+        )
+        conn.commit()
+
+    chunks_with_paths = [
+        (SemanticChunk(content=f"chunk {i}", metadata={}), f"file_{i}.py")
+        for i in range(10)
+    ]
+    embeddings = np.random.randn(10, 3).astype(np.float32)
+
+    with pytest.raises(ValueError, match=r"Chunk ID range overflow"):
+        store.add_chunks_batch_numpy(chunks_with_paths, embeddings)
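
For illustration, a minimal standalone sketch of how the new guard behaves at the boundary. It is not part of the patch: it re-implements the check outside the package and assumes a 64-bit Python build, where sys.maxsize equals the signed 64-bit rowid limit.

import sys

# Mirrors SQLITE_INTEGER_MAX from the patch: SQLite rowids are signed 64-bit integers.
SQLITE_INTEGER_MAX = (1 << 63) - 1

def validate_chunk_id_range(start_id: int, count: int) -> None:
    # Same logic as the patched _validate_chunk_id_range helper.
    if count <= 0:
        return
    last_id = start_id + count - 1
    if last_id > sys.maxsize or last_id > SQLITE_INTEGER_MAX:
        raise ValueError("Chunk ID range overflow")

# Last allocated ID lands exactly on the limit: accepted.
validate_chunk_id_range(SQLITE_INTEGER_MAX - 9, 10)

# One chunk further and the batch would overflow the rowid range: rejected.
try:
    validate_chunk_id_range(SQLITE_INTEGER_MAX - 8, 10)
except ValueError as exc:
    print(exc)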