mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-02-05 01:50:27 +08:00
fix(vector_store): add parameter validation for min_score range
Validates min_score is within [0.0, 1.0] for cosine similarity. Raises ValueError for out-of-range values to prevent unexpected filtering. Fixes: ISS-1766921318981-14 Solution-ID: SOL-1735386000-14 Issue-ID: ISS-1766921318981-14 Task-ID: T1
This commit is contained in:
@@ -730,7 +730,7 @@ class VectorStore:
|
||||
Args:
|
||||
query_embedding: Query vector.
|
||||
top_k: Maximum results to return.
|
||||
min_score: Minimum similarity score (0-1).
|
||||
min_score: Minimum cosine similarity score in [0.0, 1.0].
|
||||
return_full_content: If True, return full code block content.
|
||||
|
||||
Returns:
|
||||
@@ -738,6 +738,11 @@ class VectorStore:
|
||||
"""
|
||||
query_vec = np.array(query_embedding, dtype=np.float32)
|
||||
|
||||
if not 0.0 <= min_score <= 1.0:
|
||||
raise ValueError(
|
||||
f"Invalid min_score: {min_score}. Must be within [0.0, 1.0] for cosine similarity."
|
||||
)
|
||||
|
||||
# Try HNSW search first (O(log N))
|
||||
if (
|
||||
HNSWLIB_AVAILABLE
|
||||
@@ -769,7 +774,7 @@ class VectorStore:
|
||||
Args:
|
||||
query_vec: Query vector as numpy array
|
||||
top_k: Maximum results to return
|
||||
min_score: Minimum similarity score (0-1)
|
||||
min_score: Minimum cosine similarity score in [0.0, 1.0]
|
||||
return_full_content: If True, return full code block content
|
||||
|
||||
Returns:
|
||||
@@ -820,7 +825,7 @@ class VectorStore:
|
||||
Args:
|
||||
query_vec: Query vector as numpy array
|
||||
top_k: Maximum results to return
|
||||
min_score: Minimum similarity score (0-1)
|
||||
min_score: Minimum cosine similarity score in [0.0, 1.0]
|
||||
return_full_content: If True, return full code block content
|
||||
|
||||
Returns:
|
||||
|
||||
@@ -7,6 +7,7 @@ import numpy as np
|
||||
import pytest
|
||||
|
||||
from codexlens.entities import SemanticChunk
|
||||
import codexlens.semantic.vector_store as vector_store_module
|
||||
from codexlens.semantic.vector_store import VectorStore
|
||||
|
||||
|
||||
@@ -129,3 +130,47 @@ def test_bulk_insert_mode_transitions(monkeypatch: pytest.MonkeyPatch, temp_db:
|
||||
assert store._bulk_insert_ids == []
|
||||
assert store._bulk_insert_embeddings == []
|
||||
assert dummy_ann.total_added == store.count_chunks()
|
||||
|
||||
|
||||
def test_search_similar_min_score_validation(monkeypatch: pytest.MonkeyPatch, temp_db: Path) -> None:
|
||||
"""search_similar should validate min_score is within [0.0, 1.0]."""
|
||||
monkeypatch.setattr(vector_store_module, "HNSWLIB_AVAILABLE", False)
|
||||
store = VectorStore(temp_db)
|
||||
|
||||
chunk_a = SemanticChunk(content="chunk A", metadata={})
|
||||
chunk_a.embedding = np.array([1.0, 0.0, 0.0], dtype=np.float32)
|
||||
chunk_b = SemanticChunk(content="chunk B", metadata={})
|
||||
chunk_b.embedding = np.array([0.0, 1.0, 0.0], dtype=np.float32)
|
||||
store.add_chunks_batch([(chunk_a, "a.py"), (chunk_b, "b.py")])
|
||||
|
||||
query = [1.0, 0.0, 0.0]
|
||||
|
||||
with pytest.raises(ValueError, match=r"min_score.*\[0\.0, 1\.0\].*cosine"):
|
||||
store.search_similar(query, min_score=-0.5)
|
||||
|
||||
with pytest.raises(ValueError, match=r"min_score.*\[0\.0, 1\.0\].*cosine"):
|
||||
store.search_similar(query, min_score=1.5)
|
||||
|
||||
store.search_similar(query, min_score=0.0)
|
||||
store.search_similar(query, min_score=1.0)
|
||||
|
||||
results = store.search_similar(query, min_score=0.5, return_full_content=False)
|
||||
assert [r.path for r in results] == ["a.py"]
|
||||
|
||||
|
||||
def test_search_similar(monkeypatch: pytest.MonkeyPatch, temp_db: Path) -> None:
|
||||
"""search_similar returns results ordered by descending similarity."""
|
||||
monkeypatch.setattr(vector_store_module, "HNSWLIB_AVAILABLE", False)
|
||||
store = VectorStore(temp_db)
|
||||
|
||||
chunk_a = SemanticChunk(content="chunk A", metadata={})
|
||||
chunk_a.embedding = np.array([1.0, 0.0, 0.0], dtype=np.float32)
|
||||
chunk_b = SemanticChunk(content="chunk B", metadata={})
|
||||
chunk_b.embedding = np.array([0.0, 1.0, 0.0], dtype=np.float32)
|
||||
store.add_chunks_batch([(chunk_a, "a.py"), (chunk_b, "b.py")])
|
||||
|
||||
results = store.search_similar([1.0, 0.0, 0.0], top_k=10, min_score=0.0, return_full_content=False)
|
||||
|
||||
assert [r.path for r in results] == ["a.py", "b.py"]
|
||||
assert results[0].score == pytest.approx(1.0)
|
||||
assert results[1].score == pytest.approx(0.0)
|
||||
|
||||
Reference in New Issue
Block a user