mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-02-10 02:24:35 +08:00
162 lines
5.1 KiB
Python
162 lines
5.1 KiB
Python
"""Unit tests for BinarySearcher - binary vector search using Hamming distance.
|
|
|
|
Tests cover:
|
|
- load: mmap file loading, DB fallback, no data scenario
|
|
- search: basic search, top_k limit, empty index
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import tempfile
|
|
from pathlib import Path
|
|
from unittest.mock import MagicMock, patch, mock_open
|
|
|
|
import numpy as np
|
|
import pytest
|
|
|
|
from codexlens.search.binary_searcher import BinarySearcher
|
|
|
|
|
|
# =============================================================================
# Test Fixtures
# =============================================================================
|
|
|
|
|
|
@pytest.fixture
def temp_paths():
    """Yield a fresh temporary directory as a ``Path`` and remove it afterwards.

    Removal is best-effort: the directory is created with
    ``ignore_cleanup_errors=True`` and any ``OSError`` raised by the explicit
    cleanup call is swallowed as well.
    """
    scratch = tempfile.TemporaryDirectory(ignore_cleanup_errors=True)
    yield Path(scratch.name)
    try:
        scratch.cleanup()
    except OSError:
        # PermissionError is a subclass of OSError, so this single clause
        # covers both exception types.
        pass
|
|
|
|
|
|
@pytest.fixture
def binary_mmap_setup(temp_paths):
    """Write a random binary-vector file plus JSON metadata into ``temp_paths``.

    Returns:
        A ``(index_root, binary_matrix, chunk_ids)`` tuple: the directory the
        files were written to, the ``uint8`` matrix that was dumped to disk,
        and the chunk ids recorded in the metadata file.
    """
    n_rows, row_bytes = 10, 32  # 32 bytes per row == 256 bits

    # A fixed seed keeps the generated matrix reproducible across runs.
    matrix = np.random.default_rng(42).integers(
        0, 256, size=(n_rows, row_bytes), dtype=np.uint8
    )
    ids = [100 + offset for offset in range(n_rows)]

    # Raw vector bytes.
    vectors_file = temp_paths / "_binary_vectors.mmap"
    matrix.tofile(str(vectors_file))

    # Sidecar metadata: the matrix shape and the chunk ids.
    sidecar = vectors_file.with_suffix(".meta.json")
    with open(sidecar, "w") as handle:
        handle.write(json.dumps({"shape": [n_rows, row_bytes], "chunk_ids": ids}))

    return temp_paths, matrix, ids
|
|
|
|
|
|
# =============================================================================
# Tests: load
# =============================================================================
|
|
|
|
|
|
class TestBinarySearcherLoad:
    """Exercise ``BinarySearcher.load()`` across its data-source scenarios."""

    def test_load_mmap(self, binary_mmap_setup):
        """Loading from the on-disk mmap file succeeds and flags ``is_memmap``."""
        root_dir, _matrix, ids = binary_mmap_setup
        searcher = BinarySearcher(root_dir)

        loaded = searcher.load()

        assert loaded is True
        assert searcher._loaded is True
        assert searcher.is_memmap is True
        assert searcher.vector_count == len(ids)

    def test_load_db_fallback(self, temp_paths):
        """With no mmap file present, ``load()`` goes through ``_load_from_db``."""
        searcher = BinarySearcher(temp_paths)

        # Stub the DB loader so the test needs no real database.
        with patch.object(searcher, "_load_from_db", return_value=True) as fake_db:
            loaded = searcher.load()

        assert loaded is True
        fake_db.assert_called_once()

    def test_load_no_data(self, temp_paths):
        """``load()`` reports failure when there is no mmap file and no DB data."""
        searcher = BinarySearcher(temp_paths)

        # The directory holds no mmap file and the DB loader is forced to fail.
        with patch.object(searcher, "_load_from_db", return_value=False):
            loaded = searcher.load()

        assert loaded is False
        assert searcher._loaded is False
|
|
|
|
|
|
# =============================================================================
# Tests: search
# =============================================================================
|
|
|
|
|
|
class TestBinarySearcherSearch:
    """Exercise ``BinarySearcher.search()``."""

    def test_search_basic(self, binary_mmap_setup):
        """A plain search yields ``(chunk_id, distance)`` pairs of known ids."""
        root_dir, _matrix, ids = binary_mmap_setup
        searcher = BinarySearcher(root_dir)
        searcher.load()

        # 256-dimensional float query (presumably binarized by the searcher).
        query = np.random.default_rng(99).standard_normal(256).astype(np.float32)

        hits = searcher.search(query, top_k=5)

        assert len(hits) == 5
        # Each hit must be an (int chunk id, int Hamming distance) pair whose
        # id is one of the ids written by the fixture.
        for hit_id, hit_distance in hits:
            assert isinstance(hit_id, int)
            assert isinstance(hit_distance, int)
            assert hit_id in ids

    def test_search_top_k(self, binary_mmap_setup):
        """The number of hits follows ``top_k`` and hits are distance-ordered."""
        root_dir, _matrix, _ids = binary_mmap_setup
        searcher = BinarySearcher(root_dir)
        searcher.load()

        query = np.random.default_rng(42).standard_normal(256).astype(np.float32)

        top_three = searcher.search(query, top_k=3)
        top_seven = searcher.search(query, top_k=7)

        assert len(top_three) == 3
        assert len(top_seven) == 7
        # Distances must come back in ascending order.
        ordered = [dist for _cid, dist in top_three]
        assert ordered == sorted(ordered)

    def test_search_empty_index(self, temp_paths):
        """Searching an index that fails to load returns an empty list."""
        searcher = BinarySearcher(temp_paths)
        # The searcher is deliberately never loaded.

        query = np.zeros(256, dtype=np.float32)

        # Force load() to report failure for the duration of the search call.
        with patch.object(searcher, "load", return_value=False):
            hits = searcher.search(query, top_k=5)

        assert hits == []
|