Enhance search functionality and indexing pipeline

- Updated `cmd_search` to include line numbers and content in search results.
- Modified `IndexingPipeline` to handle start and end line numbers for chunks.
- Enhanced `FTSEngine` to support storing line metadata in the database.
- Improved `SearchPipeline` to return line numbers and full content in search results.
- Added unit tests for bridge, FTS delete operations, metadata store, and watcher functionality.
- Introduced a `.gitignore` file to exclude specific directories.
This commit is contained in:
catlog22
2026-03-17 14:55:27 +08:00
parent bfe5426b7e
commit 0f02b75be1
25 changed files with 2014 additions and 1482 deletions

1
codex-lens-v2/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
.ace-tool/

View File

@@ -129,7 +129,14 @@ def cmd_search(args: argparse.Namespace) -> None:
results = search.search(args.query, top_k=args.top_k)
_json_output([
{"path": r.path, "score": r.score, "snippet": r.snippet}
{
"path": r.path,
"score": r.score,
"line": r.line,
"end_line": r.end_line,
"snippet": r.snippet,
"content": r.content,
}
for r in results
])

View File

@@ -146,14 +146,16 @@ class IndexingPipeline:
batch_ids = []
batch_texts = []
batch_paths = []
for chunk_text, path in file_chunks:
batch_lines: list[tuple[int, int]] = []
for chunk_text, path, sl, el in file_chunks:
batch_ids.append(chunk_id)
batch_texts.append(chunk_text)
batch_paths.append(path)
batch_lines.append((sl, el))
chunk_id += 1
chunks_created += len(batch_ids)
embed_queue.put((batch_ids, batch_texts, batch_paths))
embed_queue.put((batch_ids, batch_texts, batch_paths, batch_lines))
# Signal embed worker: no more data
embed_queue.put(_SENTINEL)
@@ -203,12 +205,12 @@ class IndexingPipeline:
if item is _SENTINEL:
break
batch_ids, batch_texts, batch_paths = item
batch_ids, batch_texts, batch_paths, batch_lines = item
try:
vecs = self._embedder.embed_batch(batch_texts)
vec_array = np.array(vecs, dtype=np.float32)
id_array = np.array(batch_ids, dtype=np.int64)
out_q.put((id_array, vec_array, batch_texts, batch_paths))
out_q.put((id_array, vec_array, batch_texts, batch_paths, batch_lines))
except Exception as exc:
logger.error("Embed worker error: %s", exc)
on_error(exc)
@@ -221,19 +223,20 @@ class IndexingPipeline:
in_q: queue.Queue,
on_error: callable,
) -> None:
"""Stage 3: Pull (ids, vecs, texts, paths), write to stores."""
"""Stage 3: Pull (ids, vecs, texts, paths, lines), write to stores."""
while True:
item = in_q.get()
if item is _SENTINEL:
break
id_array, vec_array, texts, paths = item
id_array, vec_array, texts, paths, line_ranges = item
try:
self._binary_store.add(id_array, vec_array)
self._ann_index.add(id_array, vec_array)
fts_docs = [
(int(id_array[i]), paths[i], texts[i])
(int(id_array[i]), paths[i], texts[i],
line_ranges[i][0], line_ranges[i][1])
for i in range(len(id_array))
]
self._fts.add_documents(fts_docs)
@@ -251,32 +254,39 @@ class IndexingPipeline:
path: str,
max_chars: int,
overlap: int,
) -> list[tuple[str, str]]:
) -> list[tuple[str, str, int, int]]:
"""Split file text into overlapping chunks.
Returns list of (chunk_text, path) tuples.
Returns list of (chunk_text, path, start_line, end_line) tuples.
Line numbers are 1-based.
"""
if not text.strip():
return []
chunks: list[tuple[str, str]] = []
chunks: list[tuple[str, str, int, int]] = []
lines = text.splitlines(keepends=True)
current: list[str] = []
current_len = 0
chunk_start_line = 1 # 1-based
lines_consumed = 0
for line in lines:
lines_consumed += 1
if current_len + len(line) > max_chars and current:
chunk = "".join(current)
chunks.append((chunk, path))
end_line = lines_consumed - 1
chunks.append((chunk, path, chunk_start_line, end_line))
# overlap: keep last N characters
tail = "".join(current)[-overlap:]
tail = chunk[-overlap:] if overlap else ""
tail_newlines = tail.count("\n")
chunk_start_line = max(1, end_line - tail_newlines + 1)
current = [tail] if tail else []
current_len = len(tail)
current.append(line)
current_len += len(line)
if current:
chunks.append(("".join(current), path))
chunks.append(("".join(current), path, chunk_start_line, lines_consumed))
return chunks
@@ -370,10 +380,12 @@ class IndexingPipeline:
batch_ids = []
batch_texts = []
batch_paths = []
for i, (chunk_text, path) in enumerate(file_chunks):
batch_lines: list[tuple[int, int]] = []
for i, (chunk_text, path, sl, el) in enumerate(file_chunks):
batch_ids.append(start_id + i)
batch_texts.append(chunk_text)
batch_paths.append(path)
batch_lines.append((sl, el))
# Embed synchronously
vecs = self._embedder.embed_batch(batch_texts)
@@ -384,7 +396,8 @@ class IndexingPipeline:
self._binary_store.add(id_array, vec_array)
self._ann_index.add(id_array, vec_array)
fts_docs = [
(batch_ids[i], batch_paths[i], batch_texts[i])
(batch_ids[i], batch_paths[i], batch_texts[i],
batch_lines[i][0], batch_lines[i][1])
for i in range(len(batch_ids))
]
self._fts.add_documents(fts_docs)

View File

@@ -13,21 +13,50 @@ class FTSEngine:
)
self._conn.execute(
"CREATE TABLE IF NOT EXISTS docs_meta "
"(id INTEGER PRIMARY KEY, path TEXT)"
"(id INTEGER PRIMARY KEY, path TEXT, "
"start_line INTEGER DEFAULT 0, end_line INTEGER DEFAULT 0)"
)
self._conn.commit()
self._migrate_line_columns()
def add_documents(self, docs: list[tuple[int, str, str]]) -> None:
"""Add documents in batch. docs: list of (id, path, content)."""
def _migrate_line_columns(self) -> None:
    """Ensure docs_meta carries start_line/end_line columns.

    Databases created before line metadata existed lack these columns;
    they are added in place so old indexes upgrade transparently.
    """
    existing: set[str] = set()
    # PRAGMA table_info rows: (cid, name, type, notnull, dflt_value, pk)
    for row in self._conn.execute("PRAGMA table_info(docs_meta)").fetchall():
        existing.add(row[1])
    missing = [c for c in ("start_line", "end_line") if c not in existing]
    for col in missing:
        self._conn.execute(
            f"ALTER TABLE docs_meta ADD COLUMN {col} INTEGER DEFAULT 0"
        )
    self._conn.commit()
def add_documents(self, docs: list[tuple]) -> None:
"""Add documents in batch.
docs: list of (id, path, content) or (id, path, content, start_line, end_line).
"""
if not docs:
return
meta_rows = []
fts_rows = []
for doc in docs:
if len(doc) >= 5:
doc_id, path, content, sl, el = doc[0], doc[1], doc[2], doc[3], doc[4]
else:
doc_id, path, content = doc[0], doc[1], doc[2]
sl, el = 0, 0
meta_rows.append((doc_id, path, sl, el))
fts_rows.append((doc_id, content))
self._conn.executemany(
"INSERT OR REPLACE INTO docs_meta (id, path) VALUES (?, ?)",
[(doc_id, path) for doc_id, path, content in docs],
"INSERT OR REPLACE INTO docs_meta (id, path, start_line, end_line) "
"VALUES (?, ?, ?, ?)",
meta_rows,
)
self._conn.executemany(
"INSERT OR REPLACE INTO docs (rowid, content) VALUES (?, ?)",
[(doc_id, content) for doc_id, path, content in docs],
fts_rows,
)
self._conn.commit()
@@ -92,3 +121,13 @@ class FTSEngine:
)
self._conn.commit()
return len(ids)
def get_doc_meta(self, doc_id: int) -> tuple[str, int, int]:
    """Look up (path, start_line, end_line) for *doc_id*.

    Unknown ids yield ("", 0, 0); NULL line values are coerced to 0 so
    callers always receive plain ints.
    """
    cursor = self._conn.execute(
        "SELECT path, start_line, end_line FROM docs_meta WHERE id = ?",
        (doc_id,),
    )
    row = cursor.fetchone()
    if row is None:
        return "", 0, 0
    path, start, end = row
    return path, start or 0, end or 0

View File

@@ -28,6 +28,9 @@ class SearchResult:
path: str
score: float
snippet: str = ""
line: int = 0
end_line: int = 0
content: str = ""
class SearchPipeline:
@@ -162,15 +165,17 @@ class SearchPipeline:
results: list[SearchResult] = []
for doc_id, score in ranked[:final_top_k]:
path = self._fts._conn.execute(
"SELECT path FROM docs_meta WHERE id = ?", (doc_id,)
).fetchone()
path, start_line, end_line = self._fts.get_doc_meta(doc_id)
full_content = self._fts.get_content(doc_id)
results.append(
SearchResult(
id=doc_id,
path=path[0] if path else "",
path=path,
score=float(score),
snippet=self._fts.get_content(doc_id)[:200],
snippet=full_content[:200],
line=start_line,
end_line=end_line,
content=full_content,
)
)
return results

View File

@@ -0,0 +1,152 @@
"""Unit tests for bridge.py CLI — argparse parsing, JSON protocol, error handling."""
from __future__ import annotations
import json
import subprocess
import sys
from pathlib import Path
from unittest.mock import patch
import pytest
from codexlens_search.bridge import _build_parser, _json_output, _error_exit
# ---------------------------------------------------------------------------
# Parser construction
# ---------------------------------------------------------------------------
class TestParser:
    """Argparse wiring: subcommands, global options, per-command defaults."""

    @pytest.fixture(autouse=True)
    def _parser(self):
        self.parser = _build_parser()

    def test_all_subcommands_exist(self):
        # Minimal argv needed to parse each subcommand successfully.
        minimal_argv = {
            "search": ["search", "--query", "test"],
            "index-file": ["index-file", "--file", "x.py"],
            "remove-file": ["remove-file", "--file", "x.py"],
            "sync": ["sync", "--root", "/tmp"],
            "watch": ["watch", "--root", "/tmp"],
        }
        all_commands = (
            "init", "search", "index-file", "remove-file",
            "sync", "watch", "download-models", "status",
        )
        for cmd in all_commands:
            parsed = self.parser.parse_args(minimal_argv.get(cmd, [cmd]))
            assert parsed.command == cmd

    def test_global_db_path_default(self):
        parsed = self.parser.parse_args(["status"])
        # A default db_path must always be present.
        assert parsed.db_path

    def test_global_db_path_override(self):
        parsed = self.parser.parse_args(["--db-path", "/custom/path", "status"])
        assert parsed.db_path == "/custom/path"

    def test_search_args(self):
        parsed = self.parser.parse_args(["search", "-q", "hello", "-k", "5"])
        assert parsed.query == "hello"
        assert parsed.top_k == 5

    def test_search_default_top_k(self):
        parsed = self.parser.parse_args(["search", "--query", "test"])
        assert parsed.top_k == 10

    def test_sync_glob_default(self):
        parsed = self.parser.parse_args(["sync", "--root", "/tmp"])
        assert parsed.glob == "**/*"

    def test_watch_debounce_default(self):
        parsed = self.parser.parse_args(["watch", "--root", "/tmp"])
        assert parsed.debounce_ms == 500

    def test_no_command_returns_none(self):
        parsed = self.parser.parse_args([])
        assert parsed.command is None
# ---------------------------------------------------------------------------
# JSON output helpers
# ---------------------------------------------------------------------------
class TestJsonHelpers:
    """_json_output stdout formatting and _error_exit exit code."""

    def test_json_output(self, capsys):
        _json_output({"key": "value"})
        captured = capsys.readouterr().out.strip()
        assert json.loads(captured) == {"key": "value"}

    def test_json_output_list(self, capsys):
        _json_output([1, 2, 3])
        captured = capsys.readouterr().out.strip()
        assert json.loads(captured) == [1, 2, 3]

    def test_json_output_unicode(self, capsys):
        _json_output({"msg": "中文测试"})
        captured = capsys.readouterr().out.strip()
        # CJK text should survive verbatim (not be \u-escaped).
        assert "中文测试" in captured

    def test_error_exit(self):
        with pytest.raises(SystemExit) as exc_info:
            _error_exit("something broke")
        assert exc_info.value.code == 1
# ---------------------------------------------------------------------------
# cmd_init (lightweight, no model loading)
# ---------------------------------------------------------------------------
class TestCmdInit:
    """cmd_init filesystem effects (no model loading involved)."""

    def test_init_creates_databases(self, tmp_path):
        """Init should create metadata.db and fts.db."""
        import argparse

        from codexlens_search.bridge import cmd_init

        index_dir = tmp_path / "test_idx"
        ns = argparse.Namespace(db_path=str(index_dir), verbose=False)
        cmd_init(ns)
        assert (index_dir / "metadata.db").exists()
        assert (index_dir / "fts.db").exists()
# ---------------------------------------------------------------------------
# cmd_status (lightweight, no model loading)
# ---------------------------------------------------------------------------
class TestCmdStatus:
    """cmd_status JSON reporting before and after initialization."""

    def test_status_not_initialized(self, tmp_path, capsys):
        import argparse

        from codexlens_search.bridge import cmd_status

        empty_dir = tmp_path / "empty_idx"
        empty_dir.mkdir()
        ns = argparse.Namespace(db_path=str(empty_dir), verbose=False)
        cmd_status(ns)
        report = json.loads(capsys.readouterr().out.strip())
        assert report["status"] == "not_initialized"

    def test_status_after_init(self, tmp_path, capsys):
        import argparse

        from codexlens_search.bridge import cmd_init, cmd_status

        ns = argparse.Namespace(db_path=str(tmp_path / "idx"), verbose=False)
        cmd_init(ns)
        capsys.readouterr()  # discard init's own JSON output
        cmd_status(ns)
        report = json.loads(capsys.readouterr().out.strip())
        assert report["status"] == "ok"
        assert report["files_tracked"] == 0
        assert report["deleted_chunks"] == 0

View File

@@ -0,0 +1,66 @@
"""Unit tests for FTSEngine delete_by_path and get_chunk_ids_by_path."""
from __future__ import annotations
import pytest
from codexlens_search.search.fts import FTSEngine
@pytest.fixture
def fts(tmp_path):
    """Fresh FTSEngine backed by a temporary database file."""
    db_file = tmp_path / "fts.db"
    return FTSEngine(str(db_file))
class TestGetChunkIdsByPath:
    """Path → chunk-id lookups against the docs_meta table."""

    def test_empty(self, fts):
        # No documents indexed yet → empty result.
        assert fts.get_chunk_ids_by_path("a.py") == []

    def test_returns_matching_ids(self, fts):
        docs = [
            (0, "a.py", "hello world"),
            (1, "a.py", "foo bar"),
            (2, "b.py", "other content"),
        ]
        fts.add_documents(docs)
        assert sorted(fts.get_chunk_ids_by_path("a.py")) == [0, 1]

    def test_no_match(self, fts):
        fts.add_documents([(0, "a.py", "content")])
        assert fts.get_chunk_ids_by_path("b.py") == []
class TestDeleteByPath:
    """delete_by_path removes content and metadata, returning the count."""

    def test_deletes_docs_and_meta(self, fts):
        fts.add_documents([
            (0, "target.py", "to be deleted"),
            (1, "target.py", "also deleted"),
            (2, "keep.py", "keep this"),
        ])
        removed = fts.delete_by_path("target.py")
        assert removed == 2
        # target.py is gone from both tables...
        assert fts.get_chunk_ids_by_path("target.py") == []
        assert fts.get_content(0) == ""
        assert fts.get_content(1) == ""
        # ...while keep.py is untouched.
        assert fts.get_chunk_ids_by_path("keep.py") == [2]
        assert fts.get_content(2) == "keep this"

    def test_delete_nonexistent_path(self, fts):
        assert fts.delete_by_path("nonexistent.py") == 0

    def test_delete_then_search(self, fts):
        fts.add_documents([
            (0, "a.py", "unique searchable content"),
            (1, "b.py", "different content here"),
        ])
        fts.delete_by_path("a.py")
        # Deleted content must no longer be searchable.
        assert len(fts.exact_search("unique searchable")) == 0
        hits = fts.exact_search("different")
        assert len(hits) == 1
        assert hits[0][0] == 1

View File

@@ -0,0 +1,184 @@
"""Unit tests for MetadataStore — SQLite file-to-chunk mapping + tombstone tracking."""
from __future__ import annotations
import pytest
from codexlens_search.indexing.metadata import MetadataStore
@pytest.fixture
def store(tmp_path):
    """Create a fresh MetadataStore backed by a temporary database."""
    db_file = tmp_path / "meta.db"
    return MetadataStore(str(db_file))
# ---------------------------------------------------------------------------
# Table creation
# ---------------------------------------------------------------------------
class TestTableCreation:
    """Schema bootstrap: required tables and connection pragmas."""

    def test_creates_three_tables(self, store):
        """files, chunks and deleted_chunks must all exist after init."""
        rows = store._conn.execute(
            "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name"
        ).fetchall()
        table_names = {name for (name,) in rows}
        for expected in ("files", "chunks", "deleted_chunks"):
            assert expected in table_names

    def test_foreign_keys_enabled(self, store):
        """PRAGMA foreign_keys must be ON."""
        (enabled,) = store._conn.execute("PRAGMA foreign_keys").fetchone()
        assert enabled == 1

    def test_wal_mode(self, store):
        """journal_mode should be WAL for concurrency."""
        (mode,) = store._conn.execute("PRAGMA journal_mode").fetchone()
        assert mode.lower() == "wal"
# ---------------------------------------------------------------------------
# register_file
# ---------------------------------------------------------------------------
class TestRegisterFile:
    """register_file insert/upsert semantics and hash retrieval."""

    def test_register_and_retrieve(self, store):
        store.register_file("src/main.py", "abc123", 1000.0)
        assert store.get_file_hash("src/main.py") == "abc123"

    def test_register_updates_existing(self, store):
        # A second registration with a new hash must overwrite the first.
        store.register_file("a.py", "hash1", 1000.0)
        store.register_file("a.py", "hash2", 2000.0)
        assert store.get_file_hash("a.py") == "hash2"

    def test_get_file_hash_returns_none_for_unknown(self, store):
        assert store.get_file_hash("nonexistent.py") is None
# ---------------------------------------------------------------------------
# register_chunks
# ---------------------------------------------------------------------------
class TestRegisterChunks:
    """register_chunks / get_chunk_ids_for_file round-trips."""

    def test_register_and_retrieve_chunks(self, store):
        store.register_file("a.py", "h", 1.0)
        chunk_rows = [(0, "c0"), (1, "c1"), (2, "c2")]
        store.register_chunks("a.py", chunk_rows)
        assert sorted(store.get_chunk_ids_for_file("a.py")) == [0, 1, 2]

    def test_empty_chunks_list(self, store):
        # Registering no chunks is a valid no-op.
        store.register_file("a.py", "h", 1.0)
        store.register_chunks("a.py", [])
        assert store.get_chunk_ids_for_file("a.py") == []

    def test_chunks_for_unknown_file(self, store):
        assert store.get_chunk_ids_for_file("unknown.py") == []
# ---------------------------------------------------------------------------
# mark_file_deleted
# ---------------------------------------------------------------------------
class TestMarkFileDeleted:
    """mark_file_deleted tombstones chunks and drops the file row."""

    def test_tombstones_chunks(self, store):
        store.register_file("a.py", "h", 1.0)
        store.register_chunks("a.py", [(10, "c10"), (11, "c11")])
        assert store.mark_file_deleted("a.py") == 2
        assert store.get_deleted_ids() == {10, 11}

    def test_file_removed_after_delete(self, store):
        store.register_file("a.py", "h", 1.0)
        store.register_chunks("a.py", [(0, "c0")])
        store.mark_file_deleted("a.py")
        assert store.get_file_hash("a.py") is None

    def test_chunks_cascaded_after_delete(self, store):
        store.register_file("a.py", "h", 1.0)
        store.register_chunks("a.py", [(0, "c0")])
        store.mark_file_deleted("a.py")
        # Chunk rows must be gone along with the file row.
        assert store.get_chunk_ids_for_file("a.py") == []

    def test_delete_nonexistent_file(self, store):
        assert store.mark_file_deleted("nonexistent.py") == 0

    def test_delete_file_without_chunks(self, store):
        store.register_file("empty.py", "h", 1.0)
        assert store.mark_file_deleted("empty.py") == 0
        assert store.get_file_hash("empty.py") is None
# ---------------------------------------------------------------------------
# file_needs_update
# ---------------------------------------------------------------------------
class TestFileNeedsUpdate:
    """Hash-comparison change detection for incremental indexing."""

    def test_new_file_needs_update(self, store):
        # Unknown files always need indexing.
        assert store.file_needs_update("new.py", "any_hash") is True

    def test_unchanged_file(self, store):
        store.register_file("a.py", "same_hash", 1.0)
        assert store.file_needs_update("a.py", "same_hash") is False

    def test_changed_file(self, store):
        store.register_file("a.py", "old_hash", 1.0)
        assert store.file_needs_update("a.py", "new_hash") is True
# ---------------------------------------------------------------------------
# get_deleted_ids / compact_deleted
# ---------------------------------------------------------------------------
class TestDeletedIdsAndCompact:
    """Tombstone inspection (get_deleted_ids) and draining (compact_deleted)."""

    def test_empty_deleted_ids(self, store):
        assert store.get_deleted_ids() == set()

    def test_compact_returns_and_clears(self, store):
        store.register_file("a.py", "h", 1.0)
        store.register_chunks("a.py", [(5, "c5"), (6, "c6")])
        store.mark_file_deleted("a.py")
        # compact hands back the tombstoned ids and empties the table.
        assert store.compact_deleted() == {5, 6}
        assert store.get_deleted_ids() == set()

    def test_compact_noop_when_empty(self, store):
        assert store.compact_deleted() == set()
# ---------------------------------------------------------------------------
# get_all_files / max_chunk_id
# ---------------------------------------------------------------------------
class TestHelpers:
    """get_all_files snapshot and max_chunk_id high-water-mark accounting."""

    def test_get_all_files(self, store):
        store.register_file("a.py", "h1", 1.0)
        store.register_file("b.py", "h2", 2.0)
        assert store.get_all_files() == {"a.py": "h1", "b.py": "h2"}

    def test_max_chunk_id_empty(self, store):
        # -1 is the sentinel for "no chunk id ever allocated".
        assert store.max_chunk_id() == -1

    def test_max_chunk_id_active(self, store):
        store.register_file("a.py", "h", 1.0)
        store.register_chunks("a.py", [(0, "c"), (5, "c"), (3, "c")])
        assert store.max_chunk_id() == 5

    def test_max_chunk_id_includes_deleted(self, store):
        store.register_file("a.py", "h", 1.0)
        store.register_chunks("a.py", [(10, "c")])
        store.mark_file_deleted("a.py")
        # Tombstoned ids still count toward the high-water mark.
        assert store.max_chunk_id() == 10

    def test_max_chunk_id_mixed(self, store):
        store.register_file("a.py", "h", 1.0)
        store.register_chunks("a.py", [(3, "c")])
        store.register_file("b.py", "h2", 1.0)
        store.register_chunks("b.py", [(7, "c")])
        store.mark_file_deleted("a.py")
        # deleted holds 3, active holds 7 — the max spans both sets.
        assert store.max_chunk_id() == 7

View File

@@ -0,0 +1,270 @@
"""Unit tests for watcher module — events, FileWatcher debounce/dedup, IncrementalIndexer."""
from __future__ import annotations
import time
from pathlib import Path
from unittest.mock import MagicMock, patch
import pytest
from codexlens_search.watcher.events import ChangeType, FileEvent, WatcherConfig
from codexlens_search.watcher.incremental_indexer import BatchResult, IncrementalIndexer
# ---------------------------------------------------------------------------
# ChangeType enum
# ---------------------------------------------------------------------------
class TestChangeType:
    """ChangeType enum values and membership count."""

    def test_values(self):
        expected = {
            ChangeType.CREATED: "created",
            ChangeType.MODIFIED: "modified",
            ChangeType.DELETED: "deleted",
        }
        for member, value in expected.items():
            assert member.value == value

    def test_all_members(self):
        # Exactly three change kinds exist.
        assert len(ChangeType) == 3
# ---------------------------------------------------------------------------
# FileEvent
# ---------------------------------------------------------------------------
class TestFileEvent:
    """FileEvent construction and timestamp handling."""

    def test_creation(self):
        event = FileEvent(path=Path("a.py"), change_type=ChangeType.CREATED)
        assert event.path == Path("a.py")
        assert event.change_type == ChangeType.CREATED
        # Default timestamp is auto-populated as a float.
        assert isinstance(event.timestamp, float)

    def test_custom_timestamp(self):
        event = FileEvent(
            path=Path("b.py"), change_type=ChangeType.DELETED, timestamp=42.0
        )
        assert event.timestamp == 42.0
# ---------------------------------------------------------------------------
# WatcherConfig
# ---------------------------------------------------------------------------
class TestWatcherConfig:
    """WatcherConfig defaults and explicit overrides."""

    def test_defaults(self):
        cfg = WatcherConfig()
        assert cfg.debounce_ms == 500
        # Common noise directories are ignored out of the box.
        for pattern in (".git", "__pycache__", "node_modules"):
            assert pattern in cfg.ignored_patterns

    def test_custom(self):
        cfg = WatcherConfig(debounce_ms=1000, ignored_patterns={".custom"})
        assert cfg.debounce_ms == 1000
        assert cfg.ignored_patterns == {".custom"}
# ---------------------------------------------------------------------------
# BatchResult
# ---------------------------------------------------------------------------
class TestBatchResult:
    """BatchResult defaults and derived properties."""

    def test_defaults(self):
        result = BatchResult()
        assert result.files_indexed == 0
        assert result.files_removed == 0
        assert result.chunks_created == 0
        assert result.errors == []

    def test_total_processed(self):
        # total_processed = indexed + removed
        result = BatchResult(files_indexed=3, files_removed=2)
        assert result.total_processed == 5

    def test_has_errors(self):
        result = BatchResult()
        assert result.has_errors is False
        result.errors.append("oops")
        assert result.has_errors is True
# ---------------------------------------------------------------------------
# IncrementalIndexer — event routing
# ---------------------------------------------------------------------------
class TestIncrementalIndexer:
    """Event routing: CREATED/MODIFIED → index_file, DELETED → remove_file."""

    @pytest.fixture
    def mock_pipeline(self):
        pipeline = MagicMock()
        pipeline.index_file.return_value = MagicMock(
            files_processed=1, chunks_created=3
        )
        return pipeline

    @staticmethod
    def _force_kwarg(mock_call):
        """Return the `force` keyword passed to the mocked call.

        Fix: the original expression
        ``call_args.kwargs.get("force", call_args[1].get("force"))`` was a
        self-referential no-op — ``call_args.kwargs`` and ``call_args[1]``
        are the same mapping, so the "fallback" could never observe anything
        the first lookup missed. Read the kwargs once instead.
        """
        return mock_call.call_args.kwargs.get("force")

    def test_routes_created_to_index_file(self, mock_pipeline):
        indexer = IncrementalIndexer(mock_pipeline, root=Path("/project"))
        events = [
            FileEvent(Path("/project/src/new.py"), ChangeType.CREATED),
        ]
        result = indexer.process_events(events)
        assert result.files_indexed == 1
        mock_pipeline.index_file.assert_called_once()
        # CREATED should NOT use force=True
        assert self._force_kwarg(mock_pipeline.index_file) is False

    def test_routes_modified_to_index_file_with_force(self, mock_pipeline):
        indexer = IncrementalIndexer(mock_pipeline, root=Path("/project"))
        events = [
            FileEvent(Path("/project/src/changed.py"), ChangeType.MODIFIED),
        ]
        result = indexer.process_events(events)
        assert result.files_indexed == 1
        # MODIFIED must force a re-index.
        assert self._force_kwarg(mock_pipeline.index_file) is True

    def test_routes_deleted_to_remove_file(self, mock_pipeline, tmp_path):
        root = tmp_path / "project"
        root.mkdir()
        indexer = IncrementalIndexer(mock_pipeline, root=root)
        events = [
            FileEvent(root / "src" / "old.py", ChangeType.DELETED),
        ]
        result = indexer.process_events(events)
        assert result.files_removed == 1
        # On Windows relative_to produces backslashes, normalize
        removed_path = mock_pipeline.remove_file.call_args[0][0]
        assert removed_path.replace("\\", "/") == "src/old.py"

    def test_batch_with_mixed_events(self, mock_pipeline):
        indexer = IncrementalIndexer(mock_pipeline, root=Path("/project"))
        events = [
            FileEvent(Path("/project/a.py"), ChangeType.CREATED),
            FileEvent(Path("/project/b.py"), ChangeType.MODIFIED),
            FileEvent(Path("/project/c.py"), ChangeType.DELETED),
        ]
        result = indexer.process_events(events)
        assert result.files_indexed == 2
        assert result.files_removed == 1
        assert result.total_processed == 3

    def test_error_isolation(self, mock_pipeline):
        """One file failure should not stop processing of others."""
        attempts = {"n": 0}

        def side_effect(*args, **kwargs):
            attempts["n"] += 1
            if attempts["n"] == 1:
                raise RuntimeError("disk error")
            return MagicMock(files_processed=1, chunks_created=1)

        mock_pipeline.index_file.side_effect = side_effect
        indexer = IncrementalIndexer(mock_pipeline, root=Path("/project"))
        events = [
            FileEvent(Path("/project/fail.py"), ChangeType.CREATED),
            FileEvent(Path("/project/ok.py"), ChangeType.CREATED),
        ]
        result = indexer.process_events(events)
        assert result.files_indexed == 1  # second succeeded
        assert len(result.errors) == 1  # first failed
        assert "disk error" in result.errors[0]

    def test_empty_events(self, mock_pipeline):
        indexer = IncrementalIndexer(mock_pipeline)
        result = indexer.process_events([])
        assert result.total_processed == 0
        mock_pipeline.index_file.assert_not_called()
        mock_pipeline.remove_file.assert_not_called()
# ---------------------------------------------------------------------------
# FileWatcher — debounce and dedup logic (unit-level, no actual FS)
# ---------------------------------------------------------------------------
class TestFileWatcherLogic:
    """Test FileWatcher internals without starting a real watchdog Observer."""

    @pytest.fixture
    def watcher_parts(self):
        """Create a FileWatcher with mocked observer, capture callbacks."""
        # Import here since watchdog is optional
        from codexlens_search.watcher.file_watcher import FileWatcher, _EVENT_PRIORITY

        sink = []

        def on_changes(events):
            sink.extend(events)

        watcher = FileWatcher(Path("."), WatcherConfig(debounce_ms=100), on_changes)
        return watcher, sink, _EVENT_PRIORITY

    def test_event_priority_ordering(self, watcher_parts):
        _, _, priority = watcher_parts
        # DELETED outranks MODIFIED, which outranks CREATED.
        assert priority[ChangeType.DELETED] > priority[ChangeType.MODIFIED]
        assert priority[ChangeType.MODIFIED] > priority[ChangeType.CREATED]

    def test_dedup_keeps_higher_priority(self, watcher_parts, tmp_path):
        watcher, sink, _ = watcher_parts
        target = str(tmp_path / "a.py")
        watcher._on_raw_event(target, ChangeType.CREATED)
        watcher._on_raw_event(target, ChangeType.DELETED)
        watcher.flush_now()
        assert len(sink) == 1
        assert sink[0].change_type == ChangeType.DELETED

    def test_dedup_does_not_downgrade(self, watcher_parts, tmp_path):
        watcher, sink, _ = watcher_parts
        target = str(tmp_path / "b.py")
        watcher._on_raw_event(target, ChangeType.DELETED)
        watcher._on_raw_event(target, ChangeType.CREATED)
        watcher.flush_now()
        assert len(sink) == 1
        # CREATED (priority 1) < DELETED (priority 3), so DELETED stays
        assert sink[0].change_type == ChangeType.DELETED

    def test_multiple_files_kept(self, watcher_parts, tmp_path):
        watcher, sink, _ = watcher_parts
        for name, change in (
            ("a.py", ChangeType.CREATED),
            ("b.py", ChangeType.MODIFIED),
            ("c.py", ChangeType.DELETED),
        ):
            watcher._on_raw_event(str(tmp_path / name), change)
        watcher.flush_now()
        assert len(sink) == 3
        # Distinct paths are never merged with each other.
        assert len({str(e.path) for e in sink}) == 3

    def test_flush_clears_pending(self, watcher_parts, tmp_path):
        watcher, sink, _ = watcher_parts
        watcher._on_raw_event(str(tmp_path / "a.py"), ChangeType.CREATED)
        watcher.flush_now()
        assert len(sink) == 1
        sink.clear()
        # A second flush with nothing pending emits nothing.
        watcher.flush_now()
        assert len(sink) == 0

    def test_should_watch_filters_ignored(self, watcher_parts):
        watcher, _, _ = watcher_parts
        assert watcher._should_watch(Path("/project/src/main.py")) is True
        for ignored in (
            "/project/.git/config",
            "/project/node_modules/foo.js",
            "/project/__pycache__/mod.pyc",
        ):
            assert watcher._should_watch(Path(ignored)) is False

    def test_jsonl_serialization(self):
        import json

        from codexlens_search.watcher.file_watcher import FileWatcher

        events = [
            FileEvent(Path("/tmp/a.py"), ChangeType.CREATED, 1000.0),
            FileEvent(Path("/tmp/b.py"), ChangeType.DELETED, 2000.0),
        ]
        lines = FileWatcher.events_to_jsonl(events).strip().split("\n")
        assert len(lines) == 2
        first = json.loads(lines[0])
        assert first["change_type"] == "created"
        assert first["timestamp"] == 1000.0
        second = json.loads(lines[1])
        assert second["change_type"] == "deleted"