Claude-Code-Workflow/codex-lens/tests/test_embedding_status_root_model.py

import gc
import gc
import shutil
import sqlite3
import tempfile
import time
from pathlib import Path

import pytest

import codexlens.cli.embedding_manager as embedding_manager
from codexlens.cli.embedding_manager import get_embedding_stats_summary, get_embeddings_status


@pytest.fixture
def status_temp_dir() -> Path:
    """Provide a scratch directory for a test and delete it afterwards.

    The directory is created with ``tempfile.mkdtemp`` and yielded to the
    test. Teardown is best-effort: removal is attempted up to five times,
    pausing 0.1 s after each ``PermissionError``; if every attempt fails the
    directory is left behind without raising.
    """
    scratch = Path(tempfile.mkdtemp())
    try:
        yield scratch
    finally:
        # Force a collection pass before deleting; presumably this releases
        # lingering SQLite handles that keep files locked on Windows.
        gc.collect()
        attempts_left = 5
        while attempts_left:
            attempts_left -= 1
            try:
                if scratch.exists():
                    shutil.rmtree(scratch)
            except PermissionError:
                time.sleep(0.1)
            else:
                break


def _create_index_db(index_path: Path, files: list[str], embedded_files: list[str] | None = None) -> None:
    index_path.parent.mkdir(parents=True, exist_ok=True)
    with sqlite3.connect(index_path) as conn:
        cursor = conn.cursor()
        cursor.execute(
            """
            CREATE TABLE files (
                id INTEGER PRIMARY KEY,
                path TEXT NOT NULL UNIQUE,
                content TEXT,
                language TEXT,
                hash TEXT
            )
            """
        )
        cursor.executemany(
            "INSERT INTO files (path, content, language, hash) VALUES (?, ?, ?, ?)",
            [(file_path, "", "python", f"hash-{idx}") for idx, file_path in enumerate(files)],
        )

        if embedded_files is not None:
            cursor.execute(
                """
                CREATE TABLE semantic_chunks (
                    id INTEGER PRIMARY KEY,
                    file_path TEXT NOT NULL,
                    content TEXT,
                    embedding BLOB,
                    metadata TEXT,
                    category TEXT
                )
                """
            )
            cursor.executemany(
                "INSERT INTO semantic_chunks (file_path, content, embedding, metadata, category) VALUES (?, ?, ?, ?, ?)",
                [(file_path, "chunk", b"vec", "{}", "code") for file_path in embedded_files],
            )
        conn.commit()


def _create_vectors_meta_db(meta_path: Path, embedded_files: list[str], binary_vector_count: int = 0) -> None:
    """Create a minimal ``_vectors_meta.db`` SQLite fixture at *meta_path*.

    Two tables are always created: ``chunk_metadata`` and ``binary_vectors``.

    Args:
        meta_path: Destination database file. Missing parent directories
            are created.
        embedded_files: One ``chunk_metadata`` row is inserted per entry,
            with ``chunk_id`` starting at 1 and ``source_index_db`` pointing
            at ``_index.db`` next to *meta_path*.
        binary_vector_count: Number of ``binary_vectors`` rows to insert,
            with ``chunk_id`` running from 1 to this value. Zero (the
            default) leaves the table empty.
    """
    meta_path.parent.mkdir(parents=True, exist_ok=True)
    source_index = str(meta_path.parent / "_index.db")
    conn = sqlite3.connect(meta_path)
    try:
        # ``with conn`` only scopes the transaction (commit on success,
        # rollback on error); it does NOT close the connection, so the
        # handle is released explicitly in ``finally`` below.
        with conn:
            conn.execute(
                """
                CREATE TABLE chunk_metadata (
                    chunk_id INTEGER PRIMARY KEY,
                    file_path TEXT NOT NULL,
                    content TEXT,
                    start_line INTEGER,
                    end_line INTEGER,
                    category TEXT,
                    metadata TEXT,
                    source_index_db TEXT
                )
                """
            )
            conn.execute(
                """
                CREATE TABLE binary_vectors (
                    chunk_id INTEGER PRIMARY KEY,
                    vector BLOB NOT NULL
                )
                """
            )
            conn.executemany(
                """
                INSERT INTO chunk_metadata (
                    chunk_id, file_path, content, start_line, end_line, category, metadata, source_index_db
                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                """,
                [
                    (idx, file_path, "chunk", 1, 1, "code", "{}", source_index)
                    for idx, file_path in enumerate(embedded_files, start=1)
                ],
            )
            conn.executemany(
                "INSERT INTO binary_vectors (chunk_id, vector) VALUES (?, ?)",
                [(idx, b"\x01") for idx in range(1, binary_vector_count + 1)],
            )
    finally:
        # Closing promptly avoids ResourceWarning on Python 3.13+ and keeps
        # the file from staying locked (which blocks deletion on Windows).
        conn.close()


def test_root_status_does_not_inherit_child_embeddings(
    monkeypatch: pytest.MonkeyPatch, status_temp_dir: Path
) -> None:
    """Root-level status must not report embeddings that exist only in a child index.

    The root ``_index.db`` is built without a ``semantic_chunks`` table while
    ``child/_index.db`` has one chunk. Model info is stubbed so that only the
    child index reports a model.
    """
    workspace = status_temp_dir / "workspace"
    workspace.mkdir()
    root_index = workspace / "_index.db"
    child_index = workspace / "child" / "_index.db"
    _create_index_db(root_index, ["a.py", "b.py"])
    _create_index_db(child_index, ["child.py"], embedded_files=["child.py"])

    def _fake_model_info(index_path):
        # Only the index inside the "child" directory reports model details.
        if index_path.parent.name != "child":
            return None
        return {
            "model_profile": "fast",
            "model_name": "unit-test-model",
            "embedding_dim": 384,
            "backend": "fastembed",
            "created_at": "2026-03-13T00:00:00Z",
            "updated_at": "2026-03-13T00:00:00Z",
        }

    monkeypatch.setattr(embedding_manager, "_get_model_info_from_index", _fake_model_info)

    status = get_embeddings_status(workspace)
    assert status["success"] is True

    result = status["result"]
    root_view = result["root"]
    subtree_view = result["subtree"]

    # Top-level and root figures reflect the root index only.
    assert result["coverage_percent"] == 0.0
    assert result["files_with_embeddings"] == 0
    assert root_view["has_embeddings"] is False
    assert result["model_info"] is None

    # The child's embeddings show up only in the subtree aggregate.
    assert subtree_view["indexes_with_embeddings"] == 1
    assert subtree_view["coverage_percent"] > 0


def test_root_status_uses_validated_centralized_metadata(status_temp_dir: Path) -> None:
    """Centralized metadata with one chunk row plus an HNSW file counts as root embeddings.

    The root index lists two files; the centralized metadata DB has a single
    chunk for ``a.py``, so half of the files are expected to be covered.
    """
    workspace = status_temp_dir / "workspace"
    workspace.mkdir()
    _create_index_db(workspace / "_index.db", ["a.py", "b.py"])
    _create_vectors_meta_db(workspace / "_vectors_meta.db", ["a.py"])
    hnsw_file = workspace / "_vectors.hnsw"
    hnsw_file.write_bytes(b"hnsw")

    status = get_embeddings_status(workspace)
    assert status["success"] is True

    result = status["result"]
    root_view = result["root"]
    centralized_view = result["centralized"]

    assert result["coverage_percent"] == 50.0
    assert result["files_with_embeddings"] == 1
    assert result["total_chunks"] == 1
    assert root_view["has_embeddings"] is True
    assert root_view["storage_mode"] == "centralized"
    assert centralized_view["dense_ready"] is True
    assert centralized_view["usable"] is True


def test_embedding_stats_summary_skips_ignored_artifact_indexes(status_temp_dir: Path) -> None:
    """The stats summary must list only the root and ``src`` indexes.

    Indexes placed under ``dist`` and ``.workflow`` are expected to be left
    out of the summary even though they contain embedded chunks.
    """
    workspace = status_temp_dir / "workspace"
    workspace.mkdir()

    # Indexes expected to be reported.
    _create_index_db(workspace / "_index.db", ["root.py"])
    _create_index_db(workspace / "src" / "_index.db", ["src.py"])
    # Indexes expected to be skipped.
    _create_index_db(workspace / "dist" / "_index.db", ["bundle.py"], embedded_files=["bundle.py"])
    _create_index_db(workspace / ".workflow" / "_index.db", ["trace.py"], embedded_files=["trace.py"])

    summary = get_embedding_stats_summary(workspace)

    assert summary["success"] is True
    result = summary["result"]
    assert result["total_indexes"] == 2

    # Compare workspace-relative POSIX paths so the check is OS-independent.
    reported = set()
    for item in result["indexes"]:
        reported.add(Path(item["path"]).relative_to(workspace).as_posix())
    assert reported == {"_index.db", "src/_index.db"}


def test_root_status_ignores_empty_centralized_artifacts(status_temp_dir: Path) -> None:
    """Centralized artifact files without any metadata rows must not count as embeddings.

    The vector files exist on disk, but the centralized metadata DB holds
    zero chunk rows and zero binary-vector rows.
    """
    workspace = status_temp_dir / "workspace"
    workspace.mkdir()
    _create_index_db(workspace / "_index.db", ["a.py", "b.py"])
    _create_vectors_meta_db(workspace / "_vectors_meta.db", [])

    # Placeholder artifact files: present on disk but backed by no rows.
    hnsw_file = workspace / "_vectors.hnsw"
    hnsw_file.write_bytes(b"hnsw")
    mmap_file = workspace / "_binary_vectors.mmap"
    mmap_file.write_bytes(b"mmap")

    status = get_embeddings_status(workspace)
    assert status["success"] is True

    result = status["result"]
    root_view = result["root"]
    centralized_view = result["centralized"]

    assert result["coverage_percent"] == 0.0
    assert result["files_with_embeddings"] == 0
    assert root_view["has_embeddings"] is False
    assert centralized_view["chunk_metadata_rows"] == 0
    assert centralized_view["binary_vector_rows"] == 0
    assert centralized_view["usable"] is False