mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-02-05 01:50:27 +08:00
- Implemented BinaryEmbeddingBackend for fast coarse filtering using 256-dimensional binary vectors. - Developed DenseEmbeddingBackend for high-precision dense vectors (2048 dimensions) for reranking. - Created CascadeEmbeddingBackend to combine binary and dense embeddings for two-stage retrieval. - Introduced utility functions for embedding conversion and distance computation. chore: Migration 010 - Add multi-vector storage support - Added 'chunks' table to support multi-vector embeddings for cascade retrieval. - Included new columns: embedding_binary (256-dim) and embedding_dense (2048-dim) for efficient storage. - Implemented upgrade and downgrade functions to manage schema changes and data migration.
744 lines
25 KiB
Python
744 lines
25 KiB
Python
"""Tests for ANN (Approximate Nearest Neighbor) index using HNSW."""
|
|
|
|
import tempfile
|
|
from pathlib import Path
|
|
from unittest.mock import patch
|
|
|
|
import pytest
|
|
|
|
# Skip all tests if semantic dependencies not available
|
|
pytest.importorskip("numpy")
|
|
|
|
|
|
def _hnswlib_available() -> bool:
|
|
"""Check if hnswlib is available."""
|
|
try:
|
|
import hnswlib
|
|
return True
|
|
except ImportError:
|
|
return False
|
|
|
|
|
|
class TestANNIndex:
    """Test suite for ANNIndex class.

    Every test except ``test_import_check`` is skipped when ``hnswlib``
    cannot be imported.
    """

    @pytest.fixture
    def temp_db(self):
        """Yield a database path inside a temporary directory.

        Only the path is built here; the directory is removed when the
        fixture is torn down.
        """
        # ignore_cleanup_errors matches the VectorStore-based temp_db fixtures
        # in this module: a file that is still held open at teardown must not
        # turn a passing test into an error.
        with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as tmpdir:
            yield Path(tmpdir) / "_index.db"

    @pytest.fixture
    def sample_vectors(self):
        """Generate seeded sample vectors for testing."""
        import numpy as np

        np.random.seed(42)
        # 100 vectors of dimension 384 (matches fast model)
        return np.random.randn(100, 384).astype(np.float32)

    @pytest.fixture
    def sample_ids(self):
        """Generate sample IDs 1..100, one per sample vector."""
        return list(range(1, 101))

    def test_import_check(self):
        """Test that HNSWLIB_AVAILABLE flag is set correctly."""
        # Keep the try body to the import alone so that only a failed import
        # leads to a skip.
        try:
            from codexlens.semantic.ann_index import HNSWLIB_AVAILABLE
        except ImportError:
            pytest.skip("ann_index module not available")

        # Should be True if hnswlib is installed, False otherwise
        assert isinstance(HNSWLIB_AVAILABLE, bool)

    @pytest.mark.skipif(not _hnswlib_available(), reason="hnswlib not installed")
    def test_create_index(self, temp_db):
        """Test creating a new ANN index."""
        from codexlens.semantic.ann_index import ANNIndex

        index = ANNIndex(temp_db, dim=384)
        assert index.dim == 384
        assert index.count() == 0
        assert not index.is_loaded

    @pytest.mark.skipif(not _hnswlib_available(), reason="hnswlib not installed")
    def test_add_vectors(self, temp_db, sample_vectors, sample_ids):
        """Test adding vectors to the index."""
        from codexlens.semantic.ann_index import ANNIndex

        index = ANNIndex(temp_db, dim=384)
        index.add_vectors(sample_ids, sample_vectors)

        assert index.count() == 100
        assert index.is_loaded

    @pytest.mark.skipif(not _hnswlib_available(), reason="hnswlib not installed")
    def test_search(self, temp_db, sample_vectors, sample_ids):
        """Test searching for similar vectors."""
        from codexlens.semantic.ann_index import ANNIndex

        index = ANNIndex(temp_db, dim=384)
        index.add_vectors(sample_ids, sample_vectors)

        # Search for the first vector - should find itself
        query = sample_vectors[0]
        ids, distances = index.search(query, top_k=5)

        assert len(ids) == 5
        assert len(distances) == 5
        # First result should be the query vector itself (or very close)
        assert ids[0] == 1  # ID of first vector
        assert distances[0] < 0.01  # Very small distance (almost identical)

    @pytest.mark.skipif(not _hnswlib_available(), reason="hnswlib not installed")
    def test_save_and_load(self, temp_db, sample_vectors, sample_ids):
        """Test saving and loading index from disk."""
        from codexlens.semantic.ann_index import ANNIndex

        # Create and save index
        index1 = ANNIndex(temp_db, dim=384)
        index1.add_vectors(sample_ids, sample_vectors)
        index1.save()

        # Check that file was created (new naming: {db_stem}_vectors.hnsw)
        hnsw_path = temp_db.parent / f"{temp_db.stem}_vectors.hnsw"
        assert hnsw_path.exists()

        # Load in new instance
        index2 = ANNIndex(temp_db, dim=384)
        loaded = index2.load()

        assert loaded is True
        assert index2.count() == 100
        assert index2.is_loaded

        # Verify search still works
        query = sample_vectors[0]
        ids, distances = index2.search(query, top_k=5)
        assert ids[0] == 1

    @pytest.mark.skipif(not _hnswlib_available(), reason="hnswlib not installed")
    def test_load_nonexistent(self, temp_db):
        """Test loading when index file doesn't exist."""
        from codexlens.semantic.ann_index import ANNIndex

        index = ANNIndex(temp_db, dim=384)
        loaded = index.load()

        assert loaded is False
        assert not index.is_loaded

    @pytest.mark.skipif(not _hnswlib_available(), reason="hnswlib not installed")
    def test_remove_vectors(self, temp_db, sample_vectors, sample_ids):
        """Test removing vectors from the index."""
        from codexlens.semantic.ann_index import ANNIndex

        index = ANNIndex(temp_db, dim=384)
        index.add_vectors(sample_ids, sample_vectors)

        # Remove first 10 vectors
        index.remove_vectors(list(range(1, 11)))

        # Search for removed vector - should not be in results
        query = sample_vectors[0]
        ids, _ = index.search(query, top_k=5)

        # ID 1 should not be in results (soft deleted)
        assert 1 not in ids

    @pytest.mark.skipif(not _hnswlib_available(), reason="hnswlib not installed")
    def test_incremental_add(self, temp_db):
        """Test adding vectors incrementally."""
        import numpy as np
        from codexlens.semantic.ann_index import ANNIndex

        index = ANNIndex(temp_db, dim=384)

        # Add first batch
        vectors1 = np.random.randn(50, 384).astype(np.float32)
        index.add_vectors(list(range(1, 51)), vectors1)
        assert index.count() == 50

        # Add second batch
        vectors2 = np.random.randn(50, 384).astype(np.float32)
        index.add_vectors(list(range(51, 101)), vectors2)
        assert index.count() == 100

    @pytest.mark.skipif(not _hnswlib_available(), reason="hnswlib not installed")
    def test_search_empty_index(self, temp_db):
        """Test searching an empty index."""
        import numpy as np
        from codexlens.semantic.ann_index import ANNIndex

        index = ANNIndex(temp_db, dim=384)
        query = np.random.randn(384).astype(np.float32)

        ids, distances = index.search(query, top_k=5)

        assert ids == []
        assert distances == []

    @pytest.mark.skipif(not _hnswlib_available(), reason="hnswlib not installed")
    def test_invalid_dimension(self, temp_db, sample_vectors, sample_ids):
        """Test adding vectors with wrong dimension."""
        import numpy as np
        from codexlens.semantic.ann_index import ANNIndex

        index = ANNIndex(temp_db, dim=384)

        # Try to add vectors with wrong dimension
        wrong_vectors = np.random.randn(10, 768).astype(np.float32)
        with pytest.raises(ValueError, match="dimension"):
            index.add_vectors(list(range(1, 11)), wrong_vectors)

    @pytest.mark.skipif(not _hnswlib_available(), reason="hnswlib not installed")
    def test_auto_resize(self, temp_db):
        """Test that index automatically resizes when capacity is exceeded."""
        import numpy as np
        from codexlens.semantic.ann_index import ANNIndex

        index = ANNIndex(temp_db, dim=384)
        # Override initial capacity to test resize
        index._max_elements = 100

        # Add more vectors than initial capacity
        vectors = np.random.randn(150, 384).astype(np.float32)
        index.add_vectors(list(range(1, 151)), vectors)

        assert index.count() == 150
        assert index._max_elements >= 150
|
|
|
|
|
|
class TestVectorStoreWithANN:
    """Test VectorStore integration with ANN index."""

    @pytest.fixture
    def temp_db(self):
        """Yield a database path inside a temporary directory.

        Cleanup errors are ignored when the directory is removed at teardown.
        """
        with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as tmpdir:
            yield Path(tmpdir) / "_index.db"

    @pytest.fixture
    def sample_chunks(self):
        """Create 10 sample semantic chunks with seeded 384-dim embeddings."""
        import numpy as np
        from codexlens.entities import SemanticChunk

        np.random.seed(42)
        chunks = []
        for i in range(10):
            chunk = SemanticChunk(
                content=f"def function_{i}(): pass",
                metadata={"symbol_name": f"function_{i}", "symbol_kind": "function"},
            )
            chunk.embedding = np.random.randn(384).astype(np.float32).tolist()
            chunks.append(chunk)
        return chunks

    def test_vector_store_with_ann(self, temp_db, sample_chunks):
        """Test VectorStore using ANN index for search."""
        from codexlens.semantic.vector_store import VectorStore, HNSWLIB_AVAILABLE

        store = VectorStore(temp_db)

        # Add chunks
        ids = store.add_chunks(sample_chunks, "test.py")
        assert len(ids) == 10

        # Check ANN status
        # NOTE(review): presumably ann_count is never negative, in which case
        # this only proves that the two attributes are readable -- confirm.
        if HNSWLIB_AVAILABLE:
            assert store.ann_available or store.ann_count >= 0

        # Search
        query_embedding = sample_chunks[0].embedding
        results = store.search_similar(query_embedding, top_k=5)

        assert len(results) <= 5
        # The query is an embedding that was just stored, so an empty result
        # set is a failure and must not silently skip the score check.
        assert results, "search returned no results for a stored embedding"
        # First result should have high similarity
        assert results[0].score > 0.9

    def test_vector_store_rebuild_ann(self, temp_db, sample_chunks):
        """Test rebuilding ANN index from SQLite data."""
        from codexlens.semantic.vector_store import VectorStore, HNSWLIB_AVAILABLE

        if not HNSWLIB_AVAILABLE:
            pytest.skip("hnswlib not installed")

        store = VectorStore(temp_db)

        # Add chunks
        store.add_chunks(sample_chunks, "test.py")

        # Rebuild ANN index
        count = store.rebuild_ann_index()
        assert count == 10

        # Verify search works
        query_embedding = sample_chunks[0].embedding
        results = store.search_similar(query_embedding, top_k=5)
        assert len(results) > 0

    def test_vector_store_delete_updates_ann(self, temp_db, sample_chunks):
        """Test that deleting chunks updates ANN index.

        NOTE(review): only the chunk count is asserted below; the ANN index
        itself is never inspected after the delete.
        """
        from codexlens.semantic.vector_store import VectorStore, HNSWLIB_AVAILABLE

        if not HNSWLIB_AVAILABLE:
            pytest.skip("hnswlib not installed")

        store = VectorStore(temp_db)

        # Add chunks for two files
        store.add_chunks(sample_chunks[:5], "file1.py")
        store.add_chunks(sample_chunks[5:], "file2.py")

        initial_count = store.count_chunks()
        assert initial_count == 10

        # Delete one file's chunks
        deleted = store.delete_file_chunks("file1.py")
        assert deleted == 5

        # Verify count
        assert store.count_chunks() == 5

    def test_vector_store_batch_add(self, temp_db, sample_chunks):
        """Test batch adding chunks from multiple files."""
        from codexlens.semantic.vector_store import VectorStore

        store = VectorStore(temp_db)

        # Prepare chunks with paths (spread round-robin over three file names)
        chunks_with_paths = [
            (chunk, f"file{i % 3}.py")
            for i, chunk in enumerate(sample_chunks)
        ]

        # Batch add
        ids = store.add_chunks_batch(chunks_with_paths)
        assert len(ids) == 10

        # Verify
        assert store.count_chunks() == 10

    def test_vector_store_fallback_search(self, temp_db, sample_chunks):
        """Test that search falls back to brute-force when ANN unavailable."""
        from codexlens.semantic.vector_store import VectorStore

        store = VectorStore(temp_db)
        store.add_chunks(sample_chunks, "test.py")

        # Force disable ANN
        store._ann_index = None

        # Search should still work (brute-force fallback)
        query_embedding = sample_chunks[0].embedding
        results = store.search_similar(query_embedding, top_k=5)

        assert len(results) > 0
        assert results[0].score > 0.9
|
|
|
|
|
|
class TestSearchAccuracy:
    """Test search accuracy comparing ANN vs brute-force."""

    @pytest.fixture
    def temp_db(self):
        """Yield a database path inside a temporary directory.

        Cleanup errors are ignored when the directory is removed at teardown.
        """
        with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as tmpdir:
            yield Path(tmpdir) / "_index.db"

    @pytest.mark.skipif(not _hnswlib_available(), reason="hnswlib not installed")
    def test_ann_vs_brute_force_recall(self, temp_db):
        """Test that ANN search has high recall compared to brute-force.

        The same query is run once with the ANN index disabled and once after
        rebuilding it; at least 80% of the brute-force hits must reappear.
        """
        import numpy as np
        from codexlens.entities import SemanticChunk
        from codexlens.semantic.vector_store import VectorStore

        np.random.seed(42)

        # Create larger dataset
        chunks = []
        for i in range(100):
            chunk = SemanticChunk(
                content=f"code block {i}",
                metadata={"chunk_id": i},
            )
            chunk.embedding = np.random.randn(384).astype(np.float32).tolist()
            chunks.append(chunk)

        store = VectorStore(temp_db)
        store.add_chunks(chunks, "test.py")

        # Get brute-force results
        store._ann_index = None  # Force brute-force
        store._invalidate_cache()  # Clear cache to force refresh
        query = chunks[0].embedding
        bf_results = store.search_similar(query, top_k=10)
        # Use chunk_id from metadata for comparison (more reliable than path+score)
        bf_chunk_ids = {r.metadata.get("chunk_id") for r in bf_results}

        # Without a baseline there is nothing to measure recall against, so an
        # empty brute-force result must fail instead of counting as recall 1.0.
        assert bf_chunk_ids, "brute-force search returned no results to compare against"

        # Rebuild ANN and get ANN results
        store.rebuild_ann_index()
        ann_results = store.search_similar(query, top_k=10)
        ann_chunk_ids = {r.metadata.get("chunk_id") for r in ann_results}

        # Calculate recall (how many brute-force results are in ANN results)
        # ANN should find at least 80% of the same results
        overlap = len(bf_chunk_ids & ann_chunk_ids)
        recall = overlap / len(bf_chunk_ids)

        assert recall >= 0.8, (
            f"ANN recall too low: {recall} "
            f"(overlap: {overlap}, bf: {bf_chunk_ids}, ann: {ann_chunk_ids})"
        )
|
|
|
|
|
|
|
|
class TestBinaryANNIndex:
    """Test suite for BinaryANNIndex class (Hamming distance-based search)."""

    @pytest.fixture
    def temp_db(self):
        """Yield a database path inside a temporary directory.

        Only the path is built here; the directory is removed when the
        fixture is torn down.
        """
        # ignore_cleanup_errors matches the VectorStore-based temp_db fixtures
        # in this module: a file that is still held open at teardown must not
        # turn a passing test into an error.
        with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as tmpdir:
            yield Path(tmpdir) / "_index.db"

    @pytest.fixture
    def sample_binary_vectors(self):
        """Generate sample binary vectors for testing.

        Returns:
            A ``(packed, unpacked)`` pair: a list of 100 ``bytes`` objects
            produced by ``np.packbits`` and the (100, 256) uint8 array of
            0/1 values they were packed from.
        """
        import numpy as np

        np.random.seed(42)
        # 100 binary vectors of dimension 256 (packed as 32 bytes each)
        binary_unpacked = (np.random.rand(100, 256) > 0.5).astype(np.uint8)
        packed = [np.packbits(v).tobytes() for v in binary_unpacked]
        return packed, binary_unpacked

    @pytest.fixture
    def sample_ids(self):
        """Generate sample IDs 1..100, one per sample vector."""
        return list(range(1, 101))

    def test_create_binary_index(self, temp_db):
        """Test creating a new Binary ANN index."""
        from codexlens.semantic.ann_index import BinaryANNIndex

        index = BinaryANNIndex(temp_db, dim=256)
        assert index.dim == 256
        assert index.packed_dim == 32
        assert index.count() == 0
        assert not index.is_loaded

    def test_invalid_dimension(self, temp_db):
        """Test that invalid dimensions are rejected."""
        from codexlens.semantic.ann_index import BinaryANNIndex

        # Dimension must be divisible by 8
        with pytest.raises(ValueError, match="divisible by 8"):
            BinaryANNIndex(temp_db, dim=255)

        with pytest.raises(ValueError, match="positive"):
            BinaryANNIndex(temp_db, dim=0)

    def test_add_packed_vectors(self, temp_db, sample_binary_vectors, sample_ids):
        """Test adding packed binary vectors to the index."""
        from codexlens.semantic.ann_index import BinaryANNIndex

        packed, _ = sample_binary_vectors
        index = BinaryANNIndex(temp_db, dim=256)
        index.add_vectors(sample_ids, packed)

        assert index.count() == 100
        assert index.is_loaded

    def test_add_numpy_vectors(self, temp_db, sample_binary_vectors, sample_ids):
        """Test adding unpacked numpy binary vectors."""
        from codexlens.semantic.ann_index import BinaryANNIndex

        _, unpacked = sample_binary_vectors
        index = BinaryANNIndex(temp_db, dim=256)
        index.add_vectors_numpy(sample_ids, unpacked)

        assert index.count() == 100

    def test_search_packed(self, temp_db, sample_binary_vectors, sample_ids):
        """Test searching with packed binary query."""
        from codexlens.semantic.ann_index import BinaryANNIndex

        packed, _ = sample_binary_vectors
        index = BinaryANNIndex(temp_db, dim=256)
        index.add_vectors(sample_ids, packed)

        # Search for the first vector - should find itself with distance 0
        query = packed[0]
        ids, distances = index.search(query, top_k=5)

        assert len(ids) == 5
        assert len(distances) == 5
        # First result should be the query vector itself
        assert ids[0] == 1
        assert distances[0] == 0  # Hamming distance of 0 (identical)

    def test_search_numpy(self, temp_db, sample_binary_vectors, sample_ids):
        """Test searching with unpacked numpy query."""
        from codexlens.semantic.ann_index import BinaryANNIndex

        packed, unpacked = sample_binary_vectors
        index = BinaryANNIndex(temp_db, dim=256)
        index.add_vectors(sample_ids, packed)

        # Search for the first vector using numpy interface
        query = unpacked[0]
        ids, distances = index.search_numpy(query, top_k=5)

        assert len(ids) == 5
        assert ids[0] == 1
        assert distances[0] == 0

    def test_search_batch(self, temp_db, sample_binary_vectors, sample_ids):
        """Test batch search with multiple queries."""
        from codexlens.semantic.ann_index import BinaryANNIndex

        packed, _ = sample_binary_vectors
        index = BinaryANNIndex(temp_db, dim=256)
        index.add_vectors(sample_ids, packed)

        # Search for first 3 vectors
        queries = packed[:3]
        results = index.search_batch(queries, top_k=5)

        assert len(results) == 3
        # Each result should find itself first (IDs are 1-based, hence i + 1)
        for i, (ids, dists) in enumerate(results):
            assert ids[0] == i + 1
            assert dists[0] == 0

    def test_hamming_distance_ordering(self, temp_db):
        """Test that results are ordered by Hamming distance."""
        from codexlens.semantic.ann_index import BinaryANNIndex
        import numpy as np

        index = BinaryANNIndex(temp_db, dim=256)

        # Create vectors with known Hamming distances from a query
        query = np.zeros(256, dtype=np.uint8)  # All zeros
        v1 = np.zeros(256, dtype=np.uint8)  # Distance 0
        v2 = np.zeros(256, dtype=np.uint8)
        v2[:10] = 1  # Distance 10
        v3 = np.zeros(256, dtype=np.uint8)
        v3[:50] = 1  # Distance 50
        v4 = np.ones(256, dtype=np.uint8)  # Distance 256

        index.add_vectors_numpy([1, 2, 3, 4], np.array([v1, v2, v3, v4]))

        query_packed = np.packbits(query).tobytes()
        ids, distances = index.search(query_packed, top_k=4)

        assert ids == [1, 2, 3, 4]
        assert distances == [0, 10, 50, 256]

    def test_save_and_load(self, temp_db, sample_binary_vectors, sample_ids):
        """Test saving and loading binary index from disk."""
        from codexlens.semantic.ann_index import BinaryANNIndex

        packed, _ = sample_binary_vectors

        # Create and save index
        index1 = BinaryANNIndex(temp_db, dim=256)
        index1.add_vectors(sample_ids, packed)
        index1.save()

        # Check that file was created
        binary_path = temp_db.parent / f"{temp_db.stem}_binary_vectors.bin"
        assert binary_path.exists()

        # Load in new instance
        index2 = BinaryANNIndex(temp_db, dim=256)
        loaded = index2.load()

        assert loaded is True
        assert index2.count() == 100
        assert index2.is_loaded

        # Verify search still works
        query = packed[0]
        ids, distances = index2.search(query, top_k=5)
        assert ids[0] == 1
        assert distances[0] == 0

    def test_load_nonexistent(self, temp_db):
        """Test loading when index file doesn't exist."""
        from codexlens.semantic.ann_index import BinaryANNIndex

        index = BinaryANNIndex(temp_db, dim=256)
        loaded = index.load()

        assert loaded is False
        assert not index.is_loaded

    def test_remove_vectors(self, temp_db, sample_binary_vectors, sample_ids):
        """Test removing vectors from the index."""
        from codexlens.semantic.ann_index import BinaryANNIndex

        packed, _ = sample_binary_vectors
        index = BinaryANNIndex(temp_db, dim=256)
        index.add_vectors(sample_ids, packed)

        # Remove first 10 vectors
        index.remove_vectors(list(range(1, 11)))

        assert index.count() == 90

        # Removed vectors should not be findable; top_k=100 asks for every
        # vector that was originally added.
        query = packed[0]
        ids, _ = index.search(query, top_k=100)
        for removed_id in range(1, 11):
            assert removed_id not in ids

    def test_get_vector(self, temp_db, sample_binary_vectors, sample_ids):
        """Test retrieving a specific vector by ID."""
        from codexlens.semantic.ann_index import BinaryANNIndex

        packed, _ = sample_binary_vectors
        index = BinaryANNIndex(temp_db, dim=256)
        index.add_vectors(sample_ids, packed)

        # Get existing vector
        vec = index.get_vector(1)
        assert vec == packed[0]

        # Get non-existing vector
        vec = index.get_vector(9999)
        assert vec is None

    def test_clear(self, temp_db, sample_binary_vectors, sample_ids):
        """Test clearing all vectors from the index."""
        from codexlens.semantic.ann_index import BinaryANNIndex

        packed, _ = sample_binary_vectors
        index = BinaryANNIndex(temp_db, dim=256)
        index.add_vectors(sample_ids, packed)
        assert index.count() == 100

        index.clear()
        assert index.count() == 0
        assert not index.is_loaded

    def test_search_empty_index(self, temp_db):
        """Test searching an empty index."""
        from codexlens.semantic.ann_index import BinaryANNIndex
        import numpy as np

        index = BinaryANNIndex(temp_db, dim=256)
        query = np.packbits(np.zeros(256, dtype=np.uint8)).tobytes()

        ids, distances = index.search(query, top_k=5)

        assert ids == []
        assert distances == []

    def test_update_existing_vector(self, temp_db):
        """Test updating an existing vector with new data."""
        from codexlens.semantic.ann_index import BinaryANNIndex
        import numpy as np

        index = BinaryANNIndex(temp_db, dim=256)

        # Add initial vector
        v1 = np.zeros(256, dtype=np.uint8)
        index.add_vectors_numpy([1], v1.reshape(1, -1))

        # Update with different vector
        v2 = np.ones(256, dtype=np.uint8)
        index.add_vectors_numpy([1], v2.reshape(1, -1))

        # Count should still be 1
        assert index.count() == 1

        # Retrieved vector should be the updated one
        stored = index.get_vector(1)
        expected = np.packbits(v2).tobytes()
        assert stored == expected
|
|
|
|
|
|
class TestCreateAnnIndexFactory:
    """Checks for the ``create_ann_index`` factory function."""

    @pytest.fixture
    def temp_db(self):
        """Provide a database path located in a throwaway directory."""
        with tempfile.TemporaryDirectory() as scratch_dir:
            db_path = Path(scratch_dir).joinpath("_index.db")
            yield db_path

    @pytest.mark.skipif(not _hnswlib_available(), reason="hnswlib not installed")
    def test_create_hnsw_index(self, temp_db):
        """The "hnsw" index type yields an ANNIndex with the requested dim."""
        from codexlens.semantic.ann_index import ANNIndex, create_ann_index

        created = create_ann_index(temp_db, index_type="hnsw", dim=384)

        assert isinstance(created, ANNIndex)
        assert created.dim == 384

    def test_create_binary_index(self, temp_db):
        """The "binary" index type yields a BinaryANNIndex with the requested dim."""
        from codexlens.semantic.ann_index import BinaryANNIndex, create_ann_index

        created = create_ann_index(temp_db, index_type="binary", dim=256)

        assert isinstance(created, BinaryANNIndex)
        assert created.dim == 256

    def test_create_binary_index_default_dim(self, temp_db):
        """A binary index created without an explicit dim ends up with dim 256."""
        from codexlens.semantic.ann_index import BinaryANNIndex, create_ann_index

        # No dim is passed, so the factory's own default applies (2048, the
        # dense default, per the original note); for the binary type it is
        # expected to be adjusted to 256.
        created = create_ann_index(temp_db, index_type="binary")

        assert isinstance(created, BinaryANNIndex)
        assert created.dim == 256

    def test_invalid_index_type(self, temp_db):
        """An unknown index type is rejected with a ValueError."""
        from codexlens.semantic.ann_index import create_ann_index

        with pytest.raises(ValueError, match="Invalid index_type"):
            create_ann_index(temp_db, index_type="invalid")

    def test_case_insensitive_index_type(self, temp_db):
        """The index type is matched without regard to letter case."""
        from codexlens.semantic.ann_index import BinaryANNIndex, create_ann_index

        created = create_ann_index(temp_db, index_type="BINARY", dim=256)

        assert isinstance(created, BinaryANNIndex)
|